/*-
 * Copyright 2016 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "config.h"
#include "util.h"
#include "rspamd.h"
#include "message.h"
#include "html.h"
#include "html_tags.h"
#include "html_colors.h"
#include "html_entities.h"
#include "url.h"
#include "contrib/libucl/khash.h"
#include "libmime/images.h"

#include <unicode/uversion.h>
#include <unicode/ucnv.h>
#if U_ICU_VERSION_MAJOR_NUM >= 46
#include <unicode/uidna.h>
#endif

/* Set to 1 by rspamd_html_library_init once the tag lookup tables are built */
static sig_atomic_t tags_sorted = 0;
/* Set to 1 by rspamd_html_library_init once entity and colour tables are built */
static sig_atomic_t entities_sorted = 0;
static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */

/* Static description of one known HTML element (see tag_defs below) */
struct html_tag_def {
	const gchar *name;	/* element name, used as the key of the by-name table */
	gint16 id;			/* Tag_* identifier, used as the key of the by-id table */
	guint16 len;		/* length of name without the trailing NUL (filled by TAG_DEF) */
	guint flags;		/* CM_* / FL_* bits copied into parsed tags */
};

/*
 * Module debug logging helper. The expansion references `pool->tag.uid`,
 * so a variable named `pool` must be in scope wherever it is used.
 */
#define msg_debug_html(...)  rspamd_conditional_debug_fast (NULL, NULL, \
        rspamd_html_log_id, "html", pool->tag.uid, \
        G_STRFUNC, \
        __VA_ARGS__)

INIT_LOG_MODULE(html)

/*
 * Builds one html_tag_def initializer. `name` has to be a string literal,
 * because its length is computed with sizeof.
 */
#define TAG_DEF(id, name, flags) {(name), (id), (sizeof(name) - 1), (flags)}

/* Table of all elements known to the parser; indexed into khash tables at init */
static struct html_tag_def tag_defs[] = {
	/* W3C defined elements */
	TAG_DEF(Tag_A, "a", FL_HREF),
	TAG_DEF(Tag_ABBR, "abbr", (CM_INLINE)),
	TAG_DEF(Tag_ACRONYM, "acronym", (CM_INLINE)),
	TAG_DEF(Tag_ADDRESS, "address", (CM_BLOCK)),
	TAG_DEF(Tag_APPLET, "applet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
	TAG_DEF(Tag_AREA, "area", (CM_BLOCK | CM_EMPTY | FL_HREF)),
	TAG_DEF(Tag_B, "b", (CM_INLINE|FL_BLOCK)),
	TAG_DEF(Tag_BASE, "base", (CM_HEAD | CM_EMPTY)),
	TAG_DEF(Tag_BASEFONT, "basefont", (CM_INLINE | CM_EMPTY)),
	TAG_DEF(Tag_BDO, "bdo", (CM_INLINE)),
	TAG_DEF(Tag_BIG, "big", (CM_INLINE)),
	TAG_DEF(Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)),
	TAG_DEF(Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE | FL_BLOCK)),
	TAG_DEF(Tag_BR, "br", (CM_INLINE | CM_EMPTY)),
	TAG_DEF(Tag_BUTTON, "button", (CM_INLINE|FL_BLOCK)),
	TAG_DEF(Tag_CAPTION, "caption", (CM_TABLE)),
	TAG_DEF(Tag_CENTER, "center", (CM_BLOCK)),
	TAG_DEF(Tag_CITE, "cite", (CM_INLINE)),
	TAG_DEF(Tag_CODE, "code", (CM_INLINE)),
	TAG_DEF(Tag_COL, "col", (CM_TABLE | CM_EMPTY)),
	TAG_DEF(Tag_COLGROUP, "colgroup", (CM_TABLE | CM_OPT)),
	TAG_DEF(Tag_DD, "dd", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
	TAG_DEF(Tag_DEL, "del", (CM_INLINE | CM_BLOCK | CM_MIXED)),
	TAG_DEF(Tag_DFN, "dfn", (CM_INLINE)),
	TAG_DEF(Tag_DIR, "dir", (CM_BLOCK | CM_OBSOLETE)),
	TAG_DEF(Tag_DIV, "div", (CM_BLOCK|FL_BLOCK)),
	TAG_DEF(Tag_DL, "dl", (CM_BLOCK|FL_BLOCK)),
	TAG_DEF(Tag_DT, "dt", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
	TAG_DEF(Tag_EM, "em", (CM_INLINE)),
	TAG_DEF(Tag_FIELDSET, "fieldset", (CM_BLOCK)),
	TAG_DEF(Tag_FONT, "font", (FL_BLOCK)),
	TAG_DEF(Tag_FORM, "form", (CM_BLOCK)),
	TAG_DEF(Tag_FRAME, "frame", (CM_FRAMES | CM_EMPTY | FL_HREF)),
	TAG_DEF(Tag_FRAMESET, "frameset", (CM_HTML | CM_FRAMES)),
	TAG_DEF(Tag_H1, "h1", (CM_BLOCK | CM_HEADING)),
	TAG_DEF(Tag_H2, "h2", (CM_BLOCK | CM_HEADING)),
	TAG_DEF(Tag_H3, "h3", (CM_BLOCK | CM_HEADING)),
	TAG_DEF(Tag_H4, "h4", (CM_BLOCK | CM_HEADING)),
	TAG_DEF(Tag_H5, "h5", (CM_BLOCK | CM_HEADING)),
	TAG_DEF(Tag_H6, "h6", (CM_BLOCK | CM_HEADING)),
	TAG_DEF(Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
	TAG_DEF(Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)),
	TAG_DEF(Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
	TAG_DEF(Tag_I, "i", (CM_INLINE)),
	TAG_DEF(Tag_IFRAME, "iframe", (FL_HREF)),
	TAG_DEF(Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)),
	TAG_DEF(Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)),
	TAG_DEF(Tag_INS, "ins", (CM_INLINE | CM_BLOCK | CM_MIXED)),
	TAG_DEF(Tag_ISINDEX, "isindex", (CM_BLOCK | CM_EMPTY)),
	TAG_DEF(Tag_KBD, "kbd", (CM_INLINE)),
	TAG_DEF(Tag_LABEL, "label", (CM_INLINE)),
	TAG_DEF(Tag_LEGEND, "legend", (CM_INLINE)),
	TAG_DEF(Tag_LI, "li", (CM_LIST | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
	TAG_DEF(Tag_LINK, "link", (CM_HEAD | CM_EMPTY|FL_HREF)),
	TAG_DEF(Tag_LISTING, "listing", (CM_BLOCK | CM_OBSOLETE)),
	TAG_DEF(Tag_MAP, "map", (CM_INLINE|FL_HREF)),
	TAG_DEF(Tag_MENU, "menu", (CM_BLOCK | CM_OBSOLETE)),
	TAG_DEF(Tag_META, "meta", (CM_HEAD | CM_INLINE | CM_EMPTY)),
	TAG_DEF(Tag_NOFRAMES, "noframes", (CM_BLOCK | CM_FRAMES)),
	TAG_DEF(Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_MIXED)),
	TAG_DEF(Tag_OBJECT, "object", (CM_OBJECT | CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)),
	TAG_DEF(Tag_OL, "ol", (CM_BLOCK | FL_BLOCK)),
	TAG_DEF(Tag_OPTGROUP, "optgroup", (CM_FIELD | CM_OPT)),
	TAG_DEF(Tag_OPTION, "option", (CM_FIELD | CM_OPT)),
	TAG_DEF(Tag_P, "p", (CM_BLOCK | CM_OPT | FL_BLOCK)),
	TAG_DEF(Tag_PARAM, "param", (CM_INLINE | CM_EMPTY)),
	TAG_DEF(Tag_PLAINTEXT, "plaintext", (CM_BLOCK | CM_OBSOLETE)),
	TAG_DEF(Tag_PRE, "pre", (CM_BLOCK)),
	TAG_DEF(Tag_Q, "q", (CM_INLINE)),
	TAG_DEF(Tag_RB, "rb", (CM_INLINE)),
	TAG_DEF(Tag_RBC, "rbc", (CM_INLINE)),
	TAG_DEF(Tag_RP, "rp", (CM_INLINE)),
	TAG_DEF(Tag_RT, "rt", (CM_INLINE)),
	TAG_DEF(Tag_RTC, "rtc", (CM_INLINE)),
	TAG_DEF(Tag_RUBY, "ruby", (CM_INLINE)),
	TAG_DEF(Tag_S, "s", (CM_INLINE)),
	TAG_DEF(Tag_SAMP, "samp", (CM_INLINE)),
	TAG_DEF(Tag_SCRIPT, "script", (CM_HEAD | CM_MIXED)),
	TAG_DEF(Tag_SELECT, "select", (CM_INLINE | CM_FIELD)),
	TAG_DEF(Tag_SMALL, "small", (CM_INLINE)),
	TAG_DEF(Tag_SPAN, "span", (CM_BLOCK|FL_BLOCK)),
	TAG_DEF(Tag_STRIKE, "strike", (CM_INLINE)),
	TAG_DEF(Tag_STRONG, "strong", (CM_INLINE)),
	TAG_DEF(Tag_STYLE, "style", (CM_HEAD)),
	TAG_DEF(Tag_SUB, "sub", (CM_INLINE)),
	TAG_DEF(Tag_SUP, "sup", (CM_INLINE)),
	TAG_DEF(Tag_TABLE, "table", (CM_BLOCK | FL_BLOCK)),
	TAG_DEF(Tag_TBODY, "tbody", (CM_TABLE | CM_ROWGRP | CM_OPT| FL_BLOCK)),
	TAG_DEF(Tag_TD, "td", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
	TAG_DEF(Tag_TEXTAREA, "textarea", (CM_INLINE | CM_FIELD)),
	TAG_DEF(Tag_TFOOT, "tfoot", (CM_TABLE | CM_ROWGRP | CM_OPT)),
	TAG_DEF(Tag_TH, "th", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
	TAG_DEF(Tag_THEAD, "thead", (CM_TABLE | CM_ROWGRP | CM_OPT)),
	TAG_DEF(Tag_TITLE, "title", (CM_HEAD | CM_UNIQUE)),
	TAG_DEF(Tag_TR, "tr", (CM_TABLE | CM_OPT| FL_BLOCK)),
	TAG_DEF(Tag_TT, "tt", (CM_INLINE)),
	TAG_DEF(Tag_U, "u", (CM_INLINE)),
	TAG_DEF(Tag_UL, "ul", (CM_BLOCK|FL_BLOCK)),
	TAG_DEF(Tag_VAR, "var", (CM_INLINE)),
	TAG_DEF(Tag_XMP, "xmp", (CM_BLOCK | CM_OBSOLETE)),
	TAG_DEF(Tag_NEXTID, "nextid", (CM_HEAD | CM_EMPTY)),

	/* proprietary elements */
	TAG_DEF(Tag_ALIGN, "align", (CM_BLOCK)),
	TAG_DEF(Tag_BGSOUND, "bgsound", (CM_HEAD | CM_EMPTY)),
	TAG_DEF(Tag_BLINK, "blink", (CM_INLINE)),
	TAG_DEF(Tag_COMMENT, "comment", (CM_INLINE)),
	TAG_DEF(Tag_EMBED, "embed", (CM_INLINE | CM_IMG | CM_EMPTY)),
	TAG_DEF(Tag_ILAYER, "ilayer", (CM_INLINE)),
	TAG_DEF(Tag_KEYGEN, "keygen", (CM_INLINE | CM_EMPTY)),
	TAG_DEF(Tag_LAYER, "layer", (CM_BLOCK)),
	TAG_DEF(Tag_MARQUEE, "marquee", (CM_INLINE | CM_OPT)),
	TAG_DEF(Tag_MULTICOL, "multicol", (CM_BLOCK)),
	TAG_DEF(Tag_NOBR, "nobr", (CM_INLINE)),
	TAG_DEF(Tag_NOEMBED, "noembed", (CM_INLINE)),
	TAG_DEF(Tag_NOLAYER, "nolayer", (CM_BLOCK | CM_INLINE |
CM_MIXED)), TAG_DEF(Tag_NOSAVE, "nosave", (CM_BLOCK)), TAG_DEF(Tag_SERVER, "server", (CM_HEAD | CM_MIXED | CM_BLOCK | CM_INLINE)), TAG_DEF(Tag_SERVLET, "servlet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)), TAG_DEF(Tag_SPACER, "spacer", (CM_INLINE | CM_EMPTY)), TAG_DEF(Tag_WBR, "wbr", (CM_INLINE | CM_EMPTY)), }; KHASH_MAP_INIT_INT (entity_by_number, const char *); KHASH_MAP_INIT_STR (entity_by_name, const char *); KHASH_MAP_INIT_STR (tag_by_name, struct html_tag_def); KHASH_MAP_INIT_INT (tag_by_id, struct html_tag_def); KHASH_INIT (color_by_name, const rspamd_ftok_t *, struct html_color, true, rspamd_ftok_icase_hash, rspamd_ftok_icase_equal); khash_t(entity_by_number) *html_entity_by_number; khash_t(entity_by_name) *html_entity_by_name; khash_t(tag_by_name) *html_tag_by_name; khash_t(tag_by_id) *html_tag_by_id; khash_t(color_by_name) *html_color_by_name; static void rspamd_html_library_init (void) { guint i; khiter_t k; gint rc; if (!tags_sorted) { html_tag_by_id = kh_init (tag_by_id); html_tag_by_name = kh_init (tag_by_name); kh_resize (tag_by_id, html_tag_by_id, G_N_ELEMENTS (tag_defs)); kh_resize (tag_by_name, html_tag_by_name, G_N_ELEMENTS (tag_defs)); for (i = 0; i < G_N_ELEMENTS (tag_defs); i++) { k = kh_put (tag_by_id, html_tag_by_id, tag_defs[i].id, &rc); kh_val (html_tag_by_id, k) = tag_defs[i]; k = kh_put (tag_by_name, html_tag_by_name, tag_defs[i].name, &rc); kh_val (html_tag_by_name, k) = tag_defs[i]; } tags_sorted = 1; } if (!entities_sorted) { html_entity_by_number = kh_init (entity_by_number); html_entity_by_name = kh_init (entity_by_name); kh_resize (entity_by_number, html_entity_by_number, G_N_ELEMENTS (entities_defs)); kh_resize (entity_by_name, html_entity_by_name, G_N_ELEMENTS (entities_defs)); for (i = 0; i < G_N_ELEMENTS (entities_defs); i++) { if (entities_defs[i].code != 0) { k = kh_put (entity_by_number, html_entity_by_number, entities_defs[i].code, &rc); kh_val (html_entity_by_number, k) = entities_defs[i].replacement; } k = kh_put 
(entity_by_name, html_entity_by_name, entities_defs[i].name, &rc); kh_val (html_entity_by_name, k) = entities_defs[i].replacement; } html_color_by_name = kh_init (color_by_name); kh_resize (color_by_name, html_color_by_name, G_N_ELEMENTS (html_colornames)); rspamd_ftok_t *keys; keys = g_malloc0 (sizeof (rspamd_ftok_t) * G_N_ELEMENTS (html_colornames)); for (i = 0; i < G_N_ELEMENTS (html_colornames); i ++) { struct html_color c; keys[i].begin = html_colornames[i].name; keys[i].len = strlen (html_colornames[i].name); k = kh_put (color_by_name, html_color_by_name, &keys[i], &rc); c.valid = true; c.d.comp.r = html_colornames[i].rgb.r; c.d.comp.g = html_colornames[i].rgb.g; c.d.comp.b = html_colornames[i].rgb.b; c.d.comp.alpha = 255; kh_val (html_color_by_name, k) = c; } entities_sorted = 1; } } static gboolean rspamd_html_check_balance (GNode * node, GNode ** cur_level) { struct html_tag *arg = node->data, *tmp; GNode *cur; if (arg->flags & FL_CLOSING) { /* First of all check whether this tag is closing tag for parent node */ cur = node->parent; while (cur && cur->data) { tmp = cur->data; if (tmp->id == arg->id && (tmp->flags & FL_CLOSED) == 0) { tmp->flags |= FL_CLOSED; /* Destroy current node as we find corresponding parent node */ g_node_destroy (node); /* Change level */ *cur_level = cur->parent; return TRUE; } cur = cur->parent; } } else { return TRUE; } return FALSE; } gint rspamd_html_tag_by_name (const gchar *name) { khiter_t k; k = kh_get (tag_by_name, html_tag_by_name, name); if (k != kh_end (html_tag_by_name)) { return kh_val (html_tag_by_name, k).id; } return -1; } gboolean rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname) { gint id; g_assert (hc != NULL); g_assert (hc->tags_seen != NULL); id = rspamd_html_tag_by_name (tagname); if (id != -1) { return isset (hc->tags_seen, id); } return FALSE; } const gchar * rspamd_html_tag_by_id (gint id) { khiter_t k; k = kh_get (tag_by_id, html_tag_by_id, id); if (k != kh_end (html_tag_by_id)) { 
return kh_val (html_tag_by_id, k).name; } return NULL; } /* Decode HTML entitles in text */ guint rspamd_html_decode_entitles_inplace (gchar *s, gsize len) { goffset l, rep_len; gchar *t = s, *h = s, *e = s, *end_ptr, old_c; const gchar *end; const gchar *entity; gboolean seen_hash = FALSE, seen_digit_only = FALSE, seen_hex = FALSE; gint state = 0, base; UChar32 uc; khiter_t k; if (len == 0) { return 0; } else { l = len; } end = s + l; while (h - s < l) { switch (state) { /* Out of entity */ case 0: if (*h == '&') { state = 1; seen_hash = FALSE; seen_hex = FALSE; seen_digit_only = FALSE; e = h; h++; continue; } else { *t = *h; h++; t++; } break; case 1: if (*h == ';' && h > e) { decode_entity: /* Determine base */ /* First find in entities table */ old_c = *h; *h = '\0'; entity = e + 1; uc = 0; if (*entity != '#') { k = kh_get (entity_by_name, html_entity_by_name, entity); *h = old_c; if (k != kh_end (html_entity_by_name)) { if (kh_val (html_entity_by_name, k)) { rep_len = strlen (kh_val (html_entity_by_name, k)); if (end - t >= rep_len) { memcpy (t, kh_val (html_entity_by_name, k), rep_len); t += rep_len; } } else { if (end - t > h - e + 1) { memmove (t, e, h - e + 1); t += h - e + 1; } } } else { if (end - t > h - e + 1) { memmove (t, e, h - e + 1); t += h - e + 1; } } } else if (e + 2 < h) { if (*(e + 2) == 'x' || *(e + 2) == 'X') { base = 16; } else if (*(e + 2) == 'o' || *(e + 2) == 'O') { base = 8; } else { base = 10; } if (base == 10) { uc = strtoul ((e + 2), &end_ptr, base); } else { uc = strtoul ((e + 3), &end_ptr, base); } if (end_ptr != NULL && *end_ptr != '\0') { /* Skip undecoded */ *h = old_c; if (end - t > h - e + 1) { memmove (t, e, h - e + 1); t += h - e + 1; } } else { /* Search for a replacement */ *h = old_c; k = kh_get (entity_by_number, html_entity_by_number, uc); if (k != kh_end (html_entity_by_number)) { if (kh_val (html_entity_by_number, k)) { rep_len = strlen (kh_val (html_entity_by_number, k)); if (end - t >= rep_len) { memcpy (t, kh_val 
(html_entity_by_number, k), rep_len); t += rep_len; } } else { if (end - t > h - e + 1) { memmove (t, e, h - e + 1); t += h - e + 1; } } } else { /* Unicode point */ goffset off = t - s; UBool is_error = 0; if (uc > 0) { U8_APPEND (s, off, len, uc, is_error); if (!is_error) { t = s + off; } else { /* Leave invalid entities as is */ if (end - t > h - e + 1) { memmove (t, e, h - e + 1); t += h - e + 1; } } } else if (end - t > h - e + 1) { memmove (t, e, h - e + 1); t += h - e + 1; } } if (end - t > 0 && old_c != ';') { /* Fuck email clients, fuck them */ *t++ = old_c; } } } state = 0; } else if (*h == '&') { /* Previous `&` was bogus */ state = 1; if (end - t > h - e) { memmove (t, e, h - e); t += h - e; } e = h; } else if (*h == '#') { seen_hash = TRUE; if (h + 1 < end && h[1] == 'x') { seen_hex = TRUE; /* Skip one more character */ h ++; } } else if (g_ascii_isdigit (*h) || (seen_hex && g_ascii_isxdigit (*h))) { seen_digit_only = TRUE; } else { if (seen_digit_only && seen_hash && h > e) { /* We have seen some digits, so we can try to decode, eh */ /* Fuck retarded email clients... */ goto decode_entity; } seen_digit_only = FALSE; } h++; break; } } /* Leftover */ if (state == 1 && h > e) { /* Unfinished entity, copy as is */ if (end - t >= h - e) { memmove (t, e, h - e); t += h - e; } } return (t - s); } static gboolean rspamd_url_is_subdomain (rspamd_ftok_t *t1, rspamd_ftok_t *t2) { const gchar *p1, *p2; p1 = t1->begin + t1->len - 1; p2 = t2->begin + t2->len - 1; /* Skip trailing dots */ while (p1 > t1->begin) { if (*p1 != '.') { break; } p1 --; } while (p2 > t2->begin) { if (*p2 != '.') { break; } p2 --; } while (p1 > t1->begin && p2 > t2->begin) { if (*p1 != *p2) { break; } p1 --; p2 --; } if (p2 == t2->begin) { /* p2 can be subdomain of p1 if *p1 is '.' 
*/ if (p1 != t1->begin && *(p1 - 1) == '.') { return TRUE; } } else if (p1 == t1->begin) { if (p2 != t2->begin && *(p2 - 1) == '.') { return TRUE; } } return FALSE; } static void rspamd_html_url_is_phished (rspamd_mempool_t *pool, struct rspamd_url *href_url, const guchar *url_text, gsize len, gboolean *url_found, struct rspamd_url **ptext_url) { struct rspamd_url *text_url; rspamd_ftok_t disp_tok, href_tok; gint rc; goffset url_pos; gchar *url_str = NULL, *idn_hbuf; const guchar *end = url_text + len, *p; #if U_ICU_VERSION_MAJOR_NUM >= 46 static UIDNA *udn; UErrorCode uc_err = U_ZERO_ERROR; UIDNAInfo uinfo = UIDNA_INFO_INITIALIZER; #endif *url_found = FALSE; #if U_ICU_VERSION_MAJOR_NUM >= 46 if (udn == NULL) { udn = uidna_openUTS46 (UIDNA_DEFAULT, &uc_err); if (uc_err != U_ZERO_ERROR) { msg_err_pool ("cannot init idna converter: %s", u_errorName (uc_err)); } } #endif while (url_text < end && g_ascii_isspace (*url_text)) { url_text ++; } if (end > url_text + 4 && rspamd_url_find (pool, url_text, end - url_text, &url_str, RSPAMD_URL_FIND_ALL, &url_pos, NULL) && url_str != NULL) { if (url_pos > 0) { /* * We have some url at some offset, so we need to check what is * at the start of the text */ p = url_text; while (p < url_text + url_pos) { if (!g_ascii_isspace (*p)) { *url_found = FALSE; return; } p++; } } text_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url)); rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool, RSPAMD_URL_PARSE_TEXT); if (rc == URI_ERRNO_OK) { disp_tok.len = text_url->hostlen; disp_tok.begin = rspamd_url_host_unsafe (text_url); #if U_ICU_VERSION_MAJOR_NUM >= 46 if (rspamd_substring_search_caseless (rspamd_url_host_unsafe (text_url), text_url->hostlen, "xn--", 4) != -1) { idn_hbuf = rspamd_mempool_alloc (pool, text_url->hostlen * 2 + 1); /* We need to convert it to the normal value first */ disp_tok.len = uidna_nameToUnicodeUTF8 (udn, rspamd_url_host_unsafe (text_url), text_url->hostlen, idn_hbuf, text_url->hostlen * 2 + 1, 
&uinfo, &uc_err); if (uc_err != U_ZERO_ERROR) { msg_err_pool ("cannot convert to IDN: %s", u_errorName (uc_err)); disp_tok.len = text_url->hostlen; } else { disp_tok.begin = idn_hbuf; } } #endif href_tok.len = href_url->hostlen; href_tok.begin = rspamd_url_host_unsafe (href_url); #if U_ICU_VERSION_MAJOR_NUM >= 46 if (rspamd_substring_search_caseless (rspamd_url_host_unsafe (href_url), href_url->hostlen, "xn--", 4) != -1) { idn_hbuf = rspamd_mempool_alloc (pool, href_url->hostlen * 2 + 1); /* We need to convert it to the normal value first */ href_tok.len = uidna_nameToUnicodeUTF8 (udn, rspamd_url_host_unsafe (href_url), href_url->hostlen, idn_hbuf, href_url->hostlen * 2 + 1, &uinfo, &uc_err); if (uc_err != U_ZERO_ERROR) { msg_err_pool ("cannot convert to IDN: %s", u_errorName (uc_err)); href_tok.len = href_url->hostlen; } else { href_tok.begin = idn_hbuf; } } #endif if (rspamd_ftok_casecmp (&disp_tok, &href_tok) != 0 && text_url->tldlen > 0 && href_url->tldlen > 0) { /* Apply the same logic for TLD */ disp_tok.len = text_url->tldlen; disp_tok.begin = rspamd_url_tld_unsafe (text_url); #if U_ICU_VERSION_MAJOR_NUM >= 46 if (rspamd_substring_search_caseless (rspamd_url_tld_unsafe (text_url), text_url->tldlen, "xn--", 4) != -1) { idn_hbuf = rspamd_mempool_alloc (pool, text_url->tldlen * 2 + 1); /* We need to convert it to the normal value first */ disp_tok.len = uidna_nameToUnicodeUTF8 (udn, rspamd_url_tld_unsafe (text_url), text_url->tldlen, idn_hbuf, text_url->tldlen * 2 + 1, &uinfo, &uc_err); if (uc_err != U_ZERO_ERROR) { msg_err_pool ("cannot convert to IDN: %s", u_errorName (uc_err)); disp_tok.len = text_url->tldlen; } else { disp_tok.begin = idn_hbuf; } } #endif href_tok.len = href_url->tldlen; href_tok.begin = rspamd_url_tld_unsafe (href_url); #if U_ICU_VERSION_MAJOR_NUM >= 46 if (rspamd_substring_search_caseless (rspamd_url_tld_unsafe (href_url), href_url->tldlen, "xn--", 4) != -1) { idn_hbuf = rspamd_mempool_alloc (pool, href_url->tldlen * 2 + 1); /* We need to 
convert it to the normal value first */ href_tok.len = uidna_nameToUnicodeUTF8 (udn, rspamd_url_tld_unsafe (href_url), href_url->tldlen, idn_hbuf, href_url->tldlen * 2 + 1, &uinfo, &uc_err); if (uc_err != U_ZERO_ERROR) { msg_err_pool ("cannot convert to IDN: %s", u_errorName (uc_err)); href_tok.len = href_url->tldlen; } else { href_tok.begin = idn_hbuf; } } #endif if (rspamd_ftok_casecmp (&disp_tok, &href_tok) != 0) { /* Check if one url is a subdomain for another */ if (!rspamd_url_is_subdomain (&disp_tok, &href_tok)) { href_url->flags |= RSPAMD_URL_FLAG_PHISHED; href_url->phished_url = text_url; text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED; } } } *ptext_url = text_url; *url_found = TRUE; } else { msg_info_pool ("extract of url '%s' failed: %s", url_str, rspamd_url_strerror (rc)); } } } static gboolean rspamd_html_process_tag (rspamd_mempool_t *pool, struct html_content *hc, struct html_tag *tag, GNode **cur_level, gboolean *balanced) { GNode *nnode; struct html_tag *parent; if (hc->html_tags == NULL) { nnode = g_node_new (NULL); *cur_level = nnode; hc->html_tags = nnode; rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t) g_node_destroy, nnode); } if (hc->total_tags > max_tags) { hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS; } if (tag->id == -1) { /* Ignore unknown tags */ hc->total_tags ++; return FALSE; } tag->parent = *cur_level; if (!(tag->flags & CM_INLINE)) { /* Block tag */ if (tag->flags & (FL_CLOSING|FL_CLOSED)) { if (!*cur_level) { msg_debug_html ("bad parent node"); return FALSE; } if (hc->total_tags < max_tags) { nnode = g_node_new (tag); g_node_append (*cur_level, nnode); if (!rspamd_html_check_balance (nnode, cur_level)) { msg_debug_html ( "mark part as unbalanced as it has not pairable closing tags"); hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED; *balanced = FALSE; } else { *balanced = TRUE; } hc->total_tags ++; } } else { parent = (*cur_level)->data; if (parent) { if ((parent->flags & FL_IGNORE)) { tag->flags |= FL_IGNORE; } if 
(!(tag->flags & FL_CLOSED) && !(parent->flags & FL_BLOCK)) { /* We likely have some bad nesting */ if (parent->id == tag->id) { /* Something like <a>bla<a>foo... */ hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED; *balanced = FALSE; tag->parent = parent->parent; if (hc->total_tags < max_tags) { nnode = g_node_new (tag); g_node_append (parent->parent, nnode); *cur_level = nnode; hc->total_tags ++; } return TRUE; } } } if (hc->total_tags < max_tags) { nnode = g_node_new (tag); g_node_append (*cur_level, nnode); if ((tag->flags & FL_CLOSED) == 0) { *cur_level = nnode; } hc->total_tags ++; } if (tag->flags & (CM_HEAD|CM_UNKNOWN|FL_IGNORE)) { tag->flags |= FL_IGNORE; return FALSE; } } } else { /* Inline tag */ parent = (*cur_level)->data; if (parent && (parent->flags & (CM_HEAD|CM_UNKNOWN|FL_IGNORE))) { tag->flags |= FL_IGNORE; return FALSE; } } return TRUE; } #define NEW_COMPONENT(comp_type) do { \ comp = rspamd_mempool_alloc (pool, sizeof (*comp)); \ comp->type = (comp_type); \ comp->start = NULL; \ comp->len = 0; \ g_queue_push_tail (tag->params, comp); \ ret = TRUE; \ } while(0) static gboolean rspamd_html_parse_tag_component (rspamd_mempool_t *pool, const guchar *begin, const guchar *end, struct html_tag *tag) { struct html_tag_component *comp; gint len; gboolean ret = FALSE; gchar *p; if (end <= begin) { return FALSE; } p = rspamd_mempool_alloc (pool, end - begin); memcpy (p, begin, end - begin); len = rspamd_html_decode_entitles_inplace (p, end - begin); if (len == 3) { if (g_ascii_strncasecmp (p, "src", len) == 0) { NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HREF); } } else if (len == 4) { if (g_ascii_strncasecmp (p, "href", len) == 0) { NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HREF); } } if (tag->id == Tag_IMG) { /* Check width and height if presented */ if (len == 5 && g_ascii_strncasecmp (p, "width", len) == 0) { NEW_COMPONENT (RSPAMD_HTML_COMPONENT_WIDTH); } else if (len == 6 && g_ascii_strncasecmp (p, "height", len) == 0) { NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HEIGHT); } 
else if (g_ascii_strncasecmp (p, "style", len) == 0) { NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE); } } else if (tag->id == Tag_FONT) { if (len == 5){ if (g_ascii_strncasecmp (p, "color", len) == 0) { NEW_COMPONENT (RSPAMD_HTML_COMPONENT_COLOR); } else if (g_ascii_strncasecmp (p, "style", len) == 0) { NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE); } else if (g_ascii_strncasecmp (p, "class", len) == 0) { NEW_COMPONENT (RSPAMD_HTML_COMPONENT_CLASS); } } else if (len == 7) { if (g_ascii_strncasecmp (p, "bgcolor", len) == 0) { NEW_COMPONENT (RSPAMD_HTML_COMPONENT_BGCOLOR); } } else if (len == 4) { if (g_ascii_strncasecmp (p, "size", len) == 0) { NEW_COMPONENT (RSPAMD_HTML_COMPONENT_SIZE); } } } else if (tag->flags & FL_BLOCK) { if (len == 5){ if (g_ascii_strncasecmp (p, "color", len) == 0) { NEW_COMPONENT (RSPAMD_HTML_COMPONENT_COLOR); } else if (g_ascii_strncasecmp (p, "style", len) == 0) { NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE); } else if (g_ascii_strncasecmp (p, "class", len) == 0) { NEW_COMPONENT (RSPAMD_HTML_COMPONENT_CLASS); } } else if (len == 7) { if (g_ascii_strncasecmp (p, "bgcolor", len) == 0) { NEW_COMPONENT (RSPAMD_HTML_COMPONENT_BGCOLOR); } } } return ret; } static inline void rspamd_html_parse_tag_content (rspamd_mempool_t *pool, struct html_content *hc, struct html_tag *tag, const guchar *in, gint *statep, guchar const **savep) { enum { parse_start = 0, parse_name, parse_attr_name, parse_equal, parse_start_dquote, parse_dqvalue, parse_end_dquote, parse_start_squote, parse_sqvalue, parse_end_squote, parse_value, spaces_after_name, spaces_before_eq, spaces_after_eq, spaces_after_param, ignore_bad_tag } state; struct html_tag_def *found; gboolean store = FALSE; struct html_tag_component *comp; state = *statep; switch (state) { case parse_start: if (!g_ascii_isalpha (*in) && !g_ascii_isspace (*in)) { hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; state = ignore_bad_tag; tag->id = -1; tag->flags |= FL_BROKEN; } else if (g_ascii_isalpha (*in)) { state = 
parse_name; tag->name.start = in; } break; case parse_name: if (g_ascii_isspace (*in) || *in == '>' || *in == '/') { g_assert (in >= tag->name.start); if (*in == '/') { tag->flags |= FL_CLOSED; } tag->name.len = in - tag->name.start; if (tag->name.len == 0) { hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; tag->id = -1; tag->flags |= FL_BROKEN; state = ignore_bad_tag; } else { gchar *s; khiter_t k; /* We CANNOT safely modify tag's name here, as it is already parsed */ s = rspamd_mempool_alloc (pool, tag->name.len + 1); memcpy (s, tag->name.start, tag->name.len); tag->name.len = rspamd_html_decode_entitles_inplace (s, tag->name.len); tag->name.start = s; tag->name.len = rspamd_str_lc_utf8 (s, tag->name.len); s[tag->name.len] = '\0'; k = kh_get (tag_by_name, html_tag_by_name, s); if (k == kh_end (html_tag_by_name)) { hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS; tag->id = -1; } else { found = &kh_val (html_tag_by_name, k); tag->id = found->id; tag->flags = found->flags; } state = spaces_after_name; } } break; case parse_attr_name: if (*savep == NULL) { state = ignore_bad_tag; } else { const guchar *attr_name_end = in; if (*in == '=') { state = parse_equal; } else if (*in == '"') { /* No equal or something sane but we have quote character */ state = parse_start_dquote; attr_name_end = in - 1; while (attr_name_end > *savep) { if (!g_ascii_isalnum (*attr_name_end)) { attr_name_end --; } else { break; } } /* One character forward to obtain length */ attr_name_end ++; } else if (g_ascii_isspace (*in)) { state = spaces_before_eq; } else if (*in == '/') { tag->flags |= FL_CLOSED; } else if (!g_ascii_isgraph (*in)) { state = parse_value; attr_name_end = in - 1; while (attr_name_end > *savep) { if (!g_ascii_isalnum (*attr_name_end)) { attr_name_end --; } else { break; } } /* One character forward to obtain length */ attr_name_end ++; } else { return; } if (!rspamd_html_parse_tag_component (pool, *savep, attr_name_end, tag)) { /* Ignore unknown params */ *savep = NULL; } else if 
(state == parse_value) { *savep = in + 1; } } break; case spaces_after_name: if (!g_ascii_isspace (*in)) { *savep = in; if (*in == '/') { tag->flags |= FL_CLOSED; } else if (*in != '>') { state = parse_attr_name; } } break; case spaces_before_eq: if (*in == '=') { state = parse_equal; } else if (!g_ascii_isspace (*in)) { /* * HTML defines that crap could still be restored and * calculated somehow... So we have to follow this stupid behaviour */ /* * TODO: estimate what insane things do email clients in each case */ if (*in == '>') { /* * Attribtute name followed by end of tag * Should be okay (empty attribute). The rest is handled outside * this automata. */ } else if (*in == '"' || *in == '\'') { /* Attribute followed by quote... Missing '=' ? Dunno, need to test */ hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; tag->flags |= FL_BROKEN; state = ignore_bad_tag; } else { /* * Just start another attribute ignoring an empty attributes for * now. We don't use them in fact... */ state = parse_attr_name; *savep = in; } } break; case spaces_after_eq: if (*in == '"') { state = parse_start_dquote; } else if (*in == '\'') { state = parse_start_squote; } else if (!g_ascii_isspace (*in)) { if (*savep != NULL) { /* We need to save this param */ *savep = in; } state = parse_value; } break; case parse_equal: if (g_ascii_isspace (*in)) { state = spaces_after_eq; } else if (*in == '"') { state = parse_start_dquote; } else if (*in == '\'') { state = parse_start_squote; } else { if (*savep != NULL) { /* We need to save this param */ *savep = in; } state = parse_value; } break; case parse_start_dquote: if (*in == '"') { if (*savep != NULL) { /* We have an empty attribute value */ savep = NULL; } state = spaces_after_param; } else { if (*savep != NULL) { /* We need to save this param */ *savep = in; } state = parse_dqvalue; } break; case parse_start_squote: if (*in == '\'') { if (*savep != NULL) { /* We have an empty attribute value */ savep = NULL; } state = spaces_after_param; } else { 
if (*savep != NULL) {
				/* We need to save this param */
				*savep = in;
			}

			state = parse_sqvalue;
		}
		break;

	/*
	 * The three value states below share the same storing step: the value
	 * spans [*savep, in), it is copied to pool memory, entity-decoded and
	 * attached to the component queued last in tag->params. *savep is reset
	 * to NULL afterwards. Nothing is stored when *savep is already NULL.
	 */
	case parse_dqvalue:
		if (*in == '"') {
			/* Closing double quote ends the value */
			store = TRUE;
			state = parse_end_dquote;
		}

		if (store) {
			if (*savep != NULL) {
				gchar *s;

				g_assert (tag->params != NULL);
				comp = g_queue_peek_tail (tag->params);
				g_assert (comp != NULL);
				comp->len = in - *savep;
				s = rspamd_mempool_alloc (pool, comp->len);
				memcpy (s, *savep, comp->len);
				comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
				comp->start = s;
				*savep = NULL;
			}
		}
		break;

	case parse_sqvalue:
		if (*in == '\'') {
			/* Closing single quote ends the value */
			store = TRUE;
			state = parse_end_squote;
		}
		if (store) {
			if (*savep != NULL) {
				gchar *s;

				g_assert (tag->params != NULL);
				comp = g_queue_peek_tail (tag->params);
				g_assert (comp != NULL);
				comp->len = in - *savep;
				s = rspamd_mempool_alloc (pool, comp->len);
				memcpy (s, *savep, comp->len);
				comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
				comp->start = s;
				*savep = NULL;
			}
		}
		break;

	case parse_value:
		/*
		 * Unquoted value: ends at "/>", a space, '>' or a double quote.
		 * NOTE(review): in[1] is read here and in the states below; this
		 * assumes the caller guarantees one more readable byte - confirm.
		 */
		if (*in == '/' && *(in + 1) == '>') {
			tag->flags |= FL_CLOSED;
			store = TRUE;
		}
		else if (g_ascii_isspace (*in) || *in == '>' || *in == '"') {
			store = TRUE;
			state = spaces_after_param;
		}

		if (store) {
			if (*savep != NULL) {
				gchar *s;

				g_assert (tag->params != NULL);
				comp = g_queue_peek_tail (tag->params);
				g_assert (comp != NULL);
				comp->len = in - *savep;
				s = rspamd_mempool_alloc (pool, comp->len);
				memcpy (s, *savep, comp->len);
				comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
				comp->start = s;
				*savep = NULL;
			}
		}
		break;

	case parse_end_dquote:
	case parse_end_squote:
		if (g_ascii_isspace (*in)) {
			state = spaces_after_param;
		}
		else if (*in == '/' && *(in + 1) == '>') {
			tag->flags |= FL_CLOSED;
		}
		else {
			/* No space, proceed immediately to the attribute name */
			state = parse_attr_name;
			*savep = in;
		}
		break;

	case spaces_after_param:
		if (!g_ascii_isspace (*in)) {
			if (*in == '/' && *(in + 1) == '>') {
				tag->flags |= FL_CLOSED;
			}

			/* Whatever follows is treated as the start of the next attribute */
			state = parse_attr_name;
			*savep = in;
		}
		break;

	case ignore_bad_tag:
		/* Broken tag: swallow everything without changing state */
		break;
	}
*statep = state;
}

/*
 * Builds a struct rspamd_url from the raw value of an href/src attribute.
 *
 * pool   memory pool for all allocations
 * start  attribute value, `len` bytes, not necessarily NUL-terminated
 * comp   optional component to update: it first receives the value with
 *        surrounding spaces stripped and, on success, the parsed url string
 *
 * Spaces are removed, non-printable ASCII bytes are percent-encoded and a
 * scheme prefix is added when the value has none. Returns NULL when the
 * value holds no usable data or the result does not parse as a URL.
 */
struct rspamd_url *
rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
		struct html_tag_component *comp)
{
	struct rspamd_url *url;
	guint saved_flags = 0;
	gchar *decoded;
	gint rc;
	gsize decoded_len;
	const gchar *p, *s, *prefix = "http://";
	gchar *d;
	guint i, dlen;
	gboolean has_bad_chars = FALSE, no_prefix = FALSE;
	static const gchar hexdigests[16] = "0123456789abcdef";

	p = start;

	/* Strip spaces from the url */
	/* Head spaces */
	while (p < start + len && g_ascii_isspace (*p)) {
		p ++;
		start ++;
		len --;
	}

	if (comp) {
		comp->start = p;
		comp->len = len;
	}

	/* Trailing spaces */
	p = start + len - 1;

	while (p >= start && g_ascii_isspace (*p)) {
		p --;
		len --;

		if (comp) {
			comp->len --;
		}
	}

	s = start;
	dlen = 0;

	/*
	 * Upper bound of the output size: every byte that may need
	 * percent-encoding is counted as three bytes.
	 */
	for (i = 0; i < len; i ++) {
		if (G_UNLIKELY (((guint)s[i]) < 0x80 && !g_ascii_isgraph (s[i]))) {
			dlen += 3;
		}
		else {
			dlen ++;
		}
	}

	if (rspamd_substring_search (start, len, "://", 3) == -1) {
		if (len >= sizeof ("mailto:") &&
				(memcmp (start, "mailto:", sizeof ("mailto:") - 1) == 0 ||
				 memcmp (start, "tel:", sizeof ("tel:") - 1) == 0 ||
				 memcmp (start, "callto:", sizeof ("callto:") - 1) == 0)) {
			/* Exclusion, has valid but 'strange' prefix */
		}
		else {
			/*
			 * The first ASCII character that is not alphanumeric decides
			 * which prefix (if any) has to be added.
			 */
			for (i = 0; i < len; i ++) {
				if (!((s[i] & 0x80) || g_ascii_isalnum (s[i]))) {
					if (i == 0 && len > 2 && s[i] == '/' && s[i + 1] == '/') {
						/* Protocol-relative "//host/..." */
						prefix = "http:";
						dlen += sizeof ("http:") - 1;
						no_prefix = TRUE;
					}
					else if (s[i] == '@') {
						/* Likely email prefix */
						prefix = "mailto://";
						dlen += sizeof ("mailto://") - 1;
						no_prefix = TRUE;
					}
					else if (s[i] == ':' && i != 0) {
						/* Special case */
						no_prefix = FALSE;
					}
					else {
						if (i == 0) {
							/* No valid data */
							return NULL;
						}
						else {
							no_prefix = TRUE;
							dlen += strlen (prefix);
						}
					}

					break;
				}
			}
		}
	}

	decoded = rspamd_mempool_alloc (pool, dlen + 1);
	d = decoded;

	if (no_prefix) {
		gsize plen = strlen (prefix);
		memcpy (d, prefix, plen);
		d += plen;
	}

	/*
	 * We also need to remove all internal newlines, spaces
	 * and encode unsafe characters
	 */
	for (i = 0; i < len; i
++) { if (G_UNLIKELY (g_ascii_isspace (s[i]))) { continue; } else if (G_UNLIKELY (((guint)s[i]) < 0x80 && !g_ascii_isgraph (s[i]))) { /* URL encode */ *d++ = '%'; *d++ = hexdigests[(s[i] >> 4) & 0xf]; *d++ = hexdigests[s[i] & 0xf]; has_bad_chars = TRUE; } else { *d++ = s[i]; } } *d = '\0'; dlen = d - decoded; url = rspamd_mempool_alloc0 (pool, sizeof (*url)); enum rspamd_normalise_result norm_res; norm_res = rspamd_normalise_unicode_inplace (pool, decoded, &dlen); if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) { saved_flags |= RSPAMD_URL_FLAG_UNNORMALISED; } if (norm_res & (RSPAMD_UNICODE_NORM_ZERO_SPACES|RSPAMD_UNICODE_NORM_ERROR)) { saved_flags |= RSPAMD_URL_FLAG_OBSCURED; if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) { saved_flags |= RSPAMD_URL_FLAG_ZW_SPACES; } } rc = rspamd_url_parse (url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF); /* Filter some completely damaged urls */ if (rc == URI_ERRNO_OK && url->hostlen > 0 && !((url->flags & RSPAMD_URL_FLAG_OBSCURED) && (url->protocol & PROTOCOL_UNKNOWN))) { url->flags |= saved_flags; if (has_bad_chars) { url->flags |= RSPAMD_URL_FLAG_OBSCURED; } if (no_prefix) { url->flags |= RSPAMD_URL_FLAG_SCHEMALESS; } decoded = url->string; decoded_len = url->urllen; if (comp) { comp->start = decoded; comp->len = decoded_len; } /* Spaces in href usually mean an attempt to obfuscate URL */ /* See https://github.com/vstakhov/rspamd/issues/593 */ #if 0 if (has_spaces) { url->flags |= RSPAMD_URL_FLAG_OBSCURED; } #endif return url; } return NULL; } static struct rspamd_url * rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag, struct html_content *hc) { struct html_tag_component *comp; GList *cur; struct rspamd_url *url; const gchar *start; gsize len; cur = tag->params->head; while (cur) { comp = cur->data; if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) { start = comp->start; len = comp->len; /* Check base url */ if (hc && hc->base_url && comp->len > 2) { /* * Relative url canot start from the 
following: * schema:// * slash */ gchar *buf; gsize orig_len; if (rspamd_substring_search (start, len, "://", 3) == -1) { /* Assume relative url */ gboolean need_slash = FALSE; orig_len = len; len += hc->base_url->urllen; if (hc->base_url->datalen == 0) { need_slash = TRUE; len ++; } buf = rspamd_mempool_alloc (pool, len + 1); rspamd_snprintf (buf, len + 1, "%*s%s%*s", hc->base_url->urllen, hc->base_url->string, need_slash ? "/" : "", (gint)orig_len, start); start = buf; } else if (start[0] == '/' && start[1] != '/') { /* Relative to the hostname */ orig_len = len; len += hc->base_url->hostlen + hc->base_url->protocollen + 3 /* for :// */; buf = rspamd_mempool_alloc (pool, len + 1); rspamd_snprintf (buf, len + 1, "%*s://%*s/%*s", hc->base_url->protocollen, hc->base_url->string, hc->base_url->hostlen, rspamd_url_host_unsafe (hc->base_url), (gint)orig_len, start); start = buf; } } url = rspamd_html_process_url (pool, start, len, comp); if (url && tag->extra == NULL) { tag->extra = url; } return url; } cur = g_list_next (cur); } return NULL; } static void rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url, khash_t (rspamd_url_hash) *url_set) { struct rspamd_url *query_url; gchar *url_str; gint rc; gboolean prefix_added; if (url->flags & RSPAMD_URL_FLAG_UNNORMALISED) { url->flags |= RSPAMD_URL_FLAG_OBSCURED; } if (url->querylen > 0) { if (rspamd_url_find (pool, rspamd_url_query_unsafe (url), url->querylen, &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) { query_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url)); rc = rspamd_url_parse (query_url, url_str, strlen (url_str), pool, RSPAMD_URL_PARSE_TEXT); if (rc == URI_ERRNO_OK && query_url->hostlen > 0) { msg_debug_html ("found url %s in query of url" " %*s", url_str, url->querylen, rspamd_url_query_unsafe (url)); if (prefix_added) { query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS; } if (query_url->flags & (RSPAMD_URL_FLAG_UNNORMALISED|RSPAMD_URL_FLAG_OBSCURED| 
RSPAMD_URL_FLAG_NUMERIC)) { /* Set obscured flag if query url is bad */ url->flags |= RSPAMD_URL_FLAG_OBSCURED; } /* And vice-versa */ if (url->flags & RSPAMD_URL_FLAG_OBSCURED) { query_url->flags |= RSPAMD_URL_FLAG_OBSCURED; } rspamd_url_set_add_or_increase (url_set, query_url); } } } } static void rspamd_html_process_data_image (rspamd_mempool_t *pool, struct html_image *img, struct html_tag_component *src) { /* * Here, we do very basic processing of the data: * detect if we have something like: `` * We only parse base64 encoded data. * We ignore content type so far */ struct rspamd_image *parsed_image; const gchar *semicolon_pos = NULL, *end = src->start + src->len; semicolon_pos = src->start; while ((semicolon_pos = memchr (semicolon_pos, ';', end - semicolon_pos)) != NULL) { if (end - semicolon_pos > sizeof ("base64,")) { if (memcmp (semicolon_pos + 1, "base64,", sizeof ("base64,") - 1) == 0) { const gchar *data_pos = semicolon_pos + sizeof ("base64,"); gchar *decoded; gsize encoded_len = end - data_pos, decoded_len; rspamd_ftok_t inp; decoded_len = (encoded_len / 4 * 3) + 12; decoded = rspamd_mempool_alloc (pool, decoded_len); rspamd_cryptobox_base64_decode (data_pos, encoded_len, decoded, &decoded_len); inp.begin = decoded; inp.len = decoded_len; parsed_image = rspamd_maybe_process_image (pool, &inp); if (parsed_image) { msg_debug_html ("detected %s image of size %ud x %ud in data url", rspamd_image_type_str (parsed_image->type), parsed_image->width, parsed_image->height); img->embedded_image = parsed_image; } } break; } else { /* Nothing useful */ return; } semicolon_pos ++; } } static void rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag, struct html_content *hc, khash_t (rspamd_url_hash) *url_set) { struct html_tag_component *comp; struct html_image *img; rspamd_ftok_t fstr; const guchar *p; GList *cur; gulong val; gboolean seen_width = FALSE, seen_height = FALSE; goffset pos; cur = tag->params->head; img = rspamd_mempool_alloc0 
(pool, sizeof (*img)); img->tag = tag; while (cur) { comp = cur->data; if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) { fstr.begin = (gchar *)comp->start; fstr.len = comp->len; img->src = rspamd_mempool_ftokdup (pool, &fstr); if (comp->len > sizeof ("cid:") - 1 && memcmp (comp->start, "cid:", sizeof ("cid:") - 1) == 0) { /* We have an embedded image */ img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED; } else { if (comp->len > sizeof ("data:") - 1 && memcmp (comp->start, "data:", sizeof ("data:") - 1) == 0) { /* We have an embedded image in HTML tag */ img->flags |= (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA); rspamd_html_process_data_image (pool, img, comp); hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS; } else { img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL; if (img->src) { img->url = rspamd_html_process_url (pool, img->src, fstr.len, NULL); if (img->url) { img->url->flags |= RSPAMD_URL_FLAG_IMAGE; rspamd_url_set_add_or_increase (url_set, img->url); } } } } } else if (comp->type == RSPAMD_HTML_COMPONENT_HEIGHT) { rspamd_strtoul (comp->start, comp->len, &val); img->height = val; seen_height = TRUE; } else if (comp->type == RSPAMD_HTML_COMPONENT_WIDTH) { rspamd_strtoul (comp->start, comp->len, &val); img->width = val; seen_width = TRUE; } else if (comp->type == RSPAMD_HTML_COMPONENT_STYLE) { /* Try to search for height= or width= in style tag */ if (!seen_height && comp->len > 0) { pos = rspamd_substring_search_caseless (comp->start, comp->len, "height", sizeof ("height") - 1); if (pos != -1) { p = comp->start + pos + sizeof ("height") - 1; while (p < comp->start + comp->len) { if (g_ascii_isdigit (*p)) { rspamd_strtoul (p, comp->len - (p - comp->start), &val); img->height = val; break; } else if (!g_ascii_isspace (*p) && *p != '=' && *p != ':') { /* Fallback */ break; } p ++; } } } if (!seen_width && comp->len > 0) { pos = rspamd_substring_search_caseless (comp->start, comp->len, "width", sizeof ("width") - 1); if (pos != -1) { p = 
comp->start + pos + sizeof ("width") - 1; while (p < comp->start + comp->len) { if (g_ascii_isdigit (*p)) { rspamd_strtoul (p, comp->len - (p - comp->start), &val); img->width = val; break; } else if (!g_ascii_isspace (*p) && *p != '=' && *p != ':') { /* Fallback */ break; } p ++; } } } } cur = g_list_next (cur); } if (hc->images == NULL) { hc->images = g_ptr_array_sized_new (4); rspamd_mempool_notify_alloc (pool, 4 * sizeof (gpointer) + sizeof (GPtrArray)); rspamd_mempool_add_destructor (pool, rspamd_ptr_array_free_hard, hc->images); } if (img->embedded_image) { if (!seen_height) { img->height = img->embedded_image->height; } if (!seen_width) { img->width = img->embedded_image->width; } } g_ptr_array_add (hc->images, img); tag->extra = img; } static void rspamd_html_process_color (const gchar *line, guint len, struct html_color *cl) { const gchar *p = line, *end = line + len; char hexbuf[7]; rspamd_ftok_t search; struct html_color *el; memset (cl, 0, sizeof (*cl)); if (*p == '#') { /* HEX color */ p ++; rspamd_strlcpy (hexbuf, p, MIN ((gint)sizeof(hexbuf), end - p + 1)); cl->d.val = strtoul (hexbuf, NULL, 16); cl->d.comp.alpha = 255; cl->valid = TRUE; } else if (len > 4 && rspamd_lc_cmp (p, "rgb", 3) == 0) { /* We have something like rgba(x,x,x,x) or rgb(x,x,x) */ enum { obrace, num1, num2, num3, num4, skip_spaces } state = skip_spaces, next_state = obrace; gulong r = 0, g = 0, b = 0, opacity = 255; const gchar *c; gboolean valid = FALSE; p += 3; if (*p == 'a') { p ++; } c = p; while (p < end) { switch (state) { case obrace: if (*p == '(') { p ++; state = skip_spaces; next_state = num1; } else if (g_ascii_isspace (*p)) { state = skip_spaces; next_state = obrace; } else { goto stop; } break; case num1: if (*p == ',') { if (!rspamd_strtoul (c, p - c, &r)) { goto stop; } p ++; state = skip_spaces; next_state = num2; } else if (!g_ascii_isdigit (*p)) { goto stop; } else { p ++; } break; case num2: if (*p == ',') { if (!rspamd_strtoul (c, p - c, &g)) { goto stop; } p 
++; state = skip_spaces; next_state = num3; } else if (!g_ascii_isdigit (*p)) { goto stop; } else { p ++; } break; case num3: if (*p == ',') { if (!rspamd_strtoul (c, p - c, &b)) { goto stop; } valid = TRUE; p ++; state = skip_spaces; next_state = num4; } else if (*p == ')') { if (!rspamd_strtoul (c, p - c, &b)) { goto stop; } valid = TRUE; goto stop; } else if (!g_ascii_isdigit (*p)) { goto stop; } else { p ++; } break; case num4: if (*p == ',') { if (!rspamd_strtoul (c, p - c, &opacity)) { goto stop; } valid = TRUE; goto stop; } else if (*p == ')') { if (!rspamd_strtoul (c, p - c, &opacity)) { goto stop; } valid = TRUE; goto stop; } else if (!g_ascii_isdigit (*p)) { goto stop; } else { p ++; } break; case skip_spaces: if (!g_ascii_isspace (*p)) { c = p; state = next_state; } else { p ++; } break; } } stop: if (valid) { cl->d.comp.r = r; cl->d.comp.g = g; cl->d.comp.b = b; cl->d.comp.alpha = opacity; cl->valid = TRUE; } } else { khiter_t k; /* Compare color by name */ search.begin = line; search.len = len; k = kh_get (color_by_name, html_color_by_name, &search); if (k != kh_end (html_color_by_name)) { el = &kh_val (html_color_by_name, k); memcpy (cl, el, sizeof (*cl)); cl->d.comp.alpha = 255; /* Non transparent */ } } } /* * Target is used for in and out if this function returns TRUE */ static gboolean rspamd_html_process_css_size (const gchar *suffix, gsize len, gdouble *tgt) { gdouble sz = *tgt; gboolean ret = FALSE; if (len >= 2) { if (memcmp (suffix, "px", 2) == 0) { sz = (guint) sz; /* Round to number */ ret = TRUE; } else if (memcmp (suffix, "em", 2) == 0) { /* EM is 16 px, so multiply and round */ sz = (guint) (sz * 16.0); ret = TRUE; } else if (len >= 3 && memcmp (suffix, "rem", 3) == 0) { /* equal to EM in our case */ sz = (guint) (sz * 16.0); ret = TRUE; } else if (memcmp (suffix, "ex", 2) == 0) { /* * Represents the x-height of the element's font. 
* On fonts with the "x" letter, this is generally the height * of lowercase letters in the font; 1ex = 0.5em in many fonts. */ sz = (guint) (sz * 8.0); ret = TRUE; } else if (memcmp (suffix, "vw", 2) == 0) { /* * Vewport width in percentages: * we assume 1% of viewport width as 8px */ sz = (guint) (sz * 8.0); ret = TRUE; } else if (memcmp (suffix, "vh", 2) == 0) { /* * Vewport height in percentages * we assume 1% of viewport width as 6px */ sz = (guint) (sz * 6.0); ret = TRUE; } else if (len >= 4 && memcmp (suffix, "vmax", 4) == 0) { /* * Vewport width in percentages * we assume 1% of viewport width as 6px */ sz = (guint) (sz * 8.0); ret = TRUE; } else if (len >= 4 && memcmp (suffix, "vmin", 4) == 0) { /* * Vewport height in percentages * we assume 1% of viewport width as 6px */ sz = (guint) (sz * 6.0); ret = TRUE; } else if (memcmp (suffix, "pt", 2) == 0) { sz = (guint) (sz * 96.0 / 72.0); /* One point. 1pt = 1/72nd of 1in */ ret = TRUE; } else if (memcmp (suffix, "cm", 2) == 0) { sz = (guint) (sz * 96.0 / 2.54); /* 96px/2.54 */ ret = TRUE; } else if (memcmp (suffix, "mm", 2) == 0) { sz = (guint) (sz * 9.6 / 2.54); /* 9.6px/2.54 */ ret = TRUE; } else if (memcmp (suffix, "in", 2) == 0) { sz = (guint) (sz * 96.0); /* 96px */ ret = TRUE; } else if (memcmp (suffix, "pc", 2) == 0) { sz = (guint) (sz * 96.0 / 6.0); /* 1pc = 12pt = 1/6th of 1in. 
*/ ret = TRUE; } } else if (suffix[0] == '%') { /* Percentages from 16 px */ sz = (guint)(sz / 100.0 * 16.0); ret = TRUE; } if (ret) { *tgt = sz; } return ret; } static void rspamd_html_process_font_size (const gchar *line, guint len, guint *fs, gboolean is_css) { const gchar *p = line, *end = line + len; gchar *err = NULL, numbuf[64]; gdouble sz = 0; gboolean failsafe = FALSE; while (p < end && g_ascii_isspace (*p)) { p ++; len --; } if (g_ascii_isdigit (*p)) { rspamd_strlcpy (numbuf, p, MIN (sizeof (numbuf), len + 1)); sz = strtod (numbuf, &err); /* Now check leftover */ if (sz < 0) { sz = 0; } } else { /* Ignore the rest */ failsafe = TRUE; sz = is_css ? 16 : 1; /* TODO: add textual fonts descriptions */ } if (err && *err != '\0') { const gchar *e = err; gsize slen; /* Skip spaces */ while (*e && g_ascii_isspace (*e)) { e ++; } /* Lowercase */ slen = strlen (e); rspamd_str_lc ((gchar *)e, slen); if (!rspamd_html_process_css_size (e, slen, &sz)) { failsafe = TRUE; } } else { /* Failsafe naked number */ failsafe = TRUE; } if (failsafe) { if (is_css) { /* * In css mode we usually ignore sizes, but let's treat * small sizes specially */ if (sz < 1) { sz = 0; } else { sz = 16; /* Ignore */ } } else { /* In non-css mode we have to check legacy size */ sz = sz >= 1 ? 
sz * 16 : 16; } } if (sz > 32) { sz = 32; } *fs = sz; } static void rspamd_html_process_style (rspamd_mempool_t *pool, struct html_block *bl, struct html_content *hc, const gchar *style, guint len) { const gchar *p, *c, *end, *key = NULL; enum { read_key, read_colon, read_value, skip_spaces, } state = skip_spaces, next_state = read_key; guint klen = 0; gdouble opacity = 1.0; p = style; c = p; end = p + len; while (p <= end) { switch(state) { case read_key: if (p == end || *p == ':') { key = c; klen = p - c; state = skip_spaces; next_state = read_value; } else if (g_ascii_isspace (*p)) { key = c; klen = p - c; state = skip_spaces; next_state = read_colon; } p ++; break; case read_colon: if (p == end || *p == ':') { state = skip_spaces; next_state = read_value; } p ++; break; case read_value: if (p == end || *p == ';') { if (key && klen && p - c > 0) { if ((klen == 5 && g_ascii_strncasecmp (key, "color", 5) == 0) || (klen == 10 && g_ascii_strncasecmp (key, "font-color", 10) == 0)) { rspamd_html_process_color (c, p - c, &bl->font_color); msg_debug_html ("got color: %xd", bl->font_color.d.val); } else if ((klen == 16 && g_ascii_strncasecmp (key, "background-color", 16) == 0) || (klen == 10 && g_ascii_strncasecmp (key, "background", 10) == 0)) { rspamd_html_process_color (c, p - c, &bl->background_color); msg_debug_html ("got bgcolor: %xd", bl->background_color.d.val); } else if (klen == 7 && g_ascii_strncasecmp (key, "display", 7) == 0) { if (p - c >= 4 && rspamd_substring_search_caseless (c, p - c, "none", 4) != -1) { bl->visible = FALSE; msg_debug_html ("tag is not visible"); } } else if (klen == 9 && g_ascii_strncasecmp (key, "font-size", 9) == 0) { rspamd_html_process_font_size (c, p - c, &bl->font_size, TRUE); msg_debug_html ("got font size: %ud", bl->font_size); } else if (klen == 7 && g_ascii_strncasecmp (key, "opacity", 7) == 0) { gchar numbuf[64]; rspamd_strlcpy (numbuf, c, MIN (sizeof (numbuf), p - c + 1)); opacity = strtod (numbuf, NULL); if (opacity > 1) { 
opacity = 1; } else if (opacity < 0) { opacity = 0; } bl->font_color.d.comp.alpha = (guint8)(opacity * 255.0); } else if (klen == 10 && g_ascii_strncasecmp (key, "visibility", 10) == 0) { if (p - c >= 6 && rspamd_substring_search_caseless (c, p - c, "hidden", 6) != -1) { bl->visible = FALSE; msg_debug_html ("tag is not visible"); } } } key = NULL; klen = 0; state = skip_spaces; next_state = read_key; } p ++; break; case skip_spaces: if (p < end && !g_ascii_isspace (*p)) { c = p; state = next_state; } else { p ++; } break; } } } static void rspamd_html_process_block_tag (rspamd_mempool_t *pool, struct html_tag *tag, struct html_content *hc) { struct html_tag_component *comp; struct html_block *bl; rspamd_ftok_t fstr; GList *cur; cur = tag->params->head; bl = rspamd_mempool_alloc0 (pool, sizeof (*bl)); bl->tag = tag; bl->visible = TRUE; bl->font_size = (guint)-1; bl->font_color.d.comp.alpha = 255; while (cur) { comp = cur->data; if (comp->len > 0) { switch (comp->type) { case RSPAMD_HTML_COMPONENT_COLOR: fstr.begin = (gchar *) comp->start; fstr.len = comp->len; rspamd_html_process_color (comp->start, comp->len, &bl->font_color); msg_debug_html ("tag %*s; got color: %xd", tag->name.len, tag->name.start, bl->font_color.d.val); break; case RSPAMD_HTML_COMPONENT_BGCOLOR: fstr.begin = (gchar *) comp->start; fstr.len = comp->len; rspamd_html_process_color (comp->start, comp->len, &bl->background_color); msg_debug_html ("tag %*s; got color: %xd", tag->name.len, tag->name.start, bl->font_color.d.val); if (tag->id == Tag_BODY) { /* Set global background color */ memcpy (&hc->bgcolor, &bl->background_color, sizeof (hc->bgcolor)); } break; case RSPAMD_HTML_COMPONENT_STYLE: bl->style.len = comp->len; bl->style.start = comp->start; msg_debug_html ("tag: %*s; got style: %*s", tag->name.len, tag->name.start, (gint) bl->style.len, bl->style.start); rspamd_html_process_style (pool, bl, hc, comp->start, comp->len); break; case RSPAMD_HTML_COMPONENT_CLASS: fstr.begin = (gchar *) 
comp->start; fstr.len = comp->len; bl->html_class = rspamd_mempool_ftokdup (pool, &fstr); msg_debug_html ("tag: %*s; got class: %s", tag->name.len, tag->name.start, bl->html_class); break; case RSPAMD_HTML_COMPONENT_SIZE: /* Not supported by html5 */ /* FIXME maybe support it */ bl->font_size = 16; msg_debug_html ("tag %*s; got size: %*s", tag->name.len, tag->name.start, (gint)comp->len, comp->start); break; default: /* NYI */ break; } } cur = g_list_next (cur); } if (hc->blocks == NULL) { hc->blocks = g_ptr_array_sized_new (64); rspamd_mempool_notify_alloc (pool, 64 * sizeof (gpointer) + sizeof (GPtrArray)); rspamd_mempool_add_destructor (pool, rspamd_ptr_array_free_hard, hc->blocks); } g_ptr_array_add (hc->blocks, bl); tag->extra = bl; } static void rspamd_html_check_displayed_url (rspamd_mempool_t *pool, GList **exceptions, khash_t (rspamd_url_hash) *url_set, GByteArray *dest, gint href_offset, struct rspamd_url *url) { struct rspamd_url *displayed_url = NULL; struct rspamd_url *turl; gboolean url_found = FALSE; struct rspamd_process_exception *ex; if (href_offset <= 0) { /* No dispalyed url, just some text within <a> tag */ return; } url->visible_part = rspamd_mempool_alloc (pool, dest->len - href_offset + 1); rspamd_strlcpy (url->visible_part, dest->data + href_offset, dest->len - href_offset + 1); g_strstrip (url->visible_part); rspamd_html_url_is_phished (pool, url, dest->data + href_offset, dest->len - href_offset, &url_found, &displayed_url); if (url_found) { url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL; } if (exceptions && url_found) { ex = rspamd_mempool_alloc (pool, sizeof (*ex)); ex->pos = href_offset; ex->len = dest->len - href_offset; ex->type = RSPAMD_EXCEPTION_URL; ex->ptr = url; *exceptions = g_list_prepend (*exceptions, ex); } if (displayed_url && url_set) { turl = rspamd_url_set_add_or_return (url_set, displayed_url); if (turl != NULL) { /* Here, we assume the following: * if we have a URL in the text part which * is the same as displayed URL in 
the * HTML part, we assume that it is also * hint only. */ if (turl->flags & RSPAMD_URL_FLAG_FROM_TEXT) { turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED; turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT; } turl->count ++; } else { /* Already inserted by `rspamd_url_set_add_or_return` */ } } } static gboolean rspamd_html_propagate_lengths (GNode *node, gpointer _unused) { GNode *child; struct html_tag *tag = node->data, *cld_tag; if (tag) { child = node->children; /* Summarize content length from children */ while (child) { cld_tag = child->data; tag->content_length += cld_tag->content_length; child = child->next; } } return FALSE; } static void rspamd_html_propagate_style (struct html_content *hc, struct html_tag *tag, struct html_block *bl, GQueue *blocks) { struct html_block *bl_parent; gboolean push_block = FALSE; /* Propagate from the parent if needed */ bl_parent = g_queue_peek_tail (blocks); if (bl_parent) { if (!bl->background_color.valid) { /* Try to propagate background color from parent nodes */ if (bl_parent->background_color.valid) { memcpy (&bl->background_color, &bl_parent->background_color, sizeof (bl->background_color)); } } else { push_block = TRUE; } if (!bl->font_color.valid) { /* Try to propagate background color from parent nodes */ if (bl_parent->font_color.valid) { memcpy (&bl->font_color, &bl_parent->font_color, sizeof (bl->font_color)); } } else { push_block = TRUE; } /* Propagate font size */ if (bl->font_size == (guint)-1) { if (bl_parent->font_size != (guint)-1) { bl->font_size = bl_parent->font_size; } } else { push_block = TRUE; } } /* Set bgcolor to the html bgcolor and font color to black as a last resort */ if (!bl->font_color.valid) { /* Don't touch opacity as it can be set separately */ bl->font_color.d.comp.r = 0; bl->font_color.d.comp.g = 0; bl->font_color.d.comp.b = 0; bl->font_color.valid = TRUE; } else { push_block = TRUE; } if (!bl->background_color.valid) { memcpy (&bl->background_color, &hc->bgcolor, sizeof (hc->bgcolor)); } else 
{ push_block = TRUE; } if (bl->font_size == (guint)-1) { bl->font_size = 16; /* Default for browsers */ } else { push_block = TRUE; } if (push_block && !(tag->flags & FL_CLOSED)) { g_queue_push_tail (blocks, bl); } } GByteArray* rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, GByteArray *in, GList **exceptions, khash_t (rspamd_url_hash) *url_set) { const guchar *p, *c, *end, *savep = NULL; guchar t; gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE, balanced; GByteArray *dest; guint obrace = 0, ebrace = 0; GNode *cur_level = NULL; gint substate = 0, len, href_offset = -1; struct html_tag *cur_tag = NULL, *content_tag = NULL; struct rspamd_url *url = NULL; GQueue *styles_blocks; enum { parse_start = 0, tag_begin, sgml_tag, xml_tag, compound_tag, comment_tag, comment_content, sgml_content, tag_content, tag_end, xml_tag_end, content_ignore, content_write, content_ignore_sp } state = parse_start; g_assert (in != NULL); g_assert (hc != NULL); g_assert (pool != NULL); rspamd_html_library_init (); hc->tags_seen = rspamd_mempool_alloc0 (pool, NBYTES (N_TAGS)); /* Set white background color by default */ hc->bgcolor.d.comp.alpha = 0; hc->bgcolor.d.comp.r = 255; hc->bgcolor.d.comp.g = 255; hc->bgcolor.d.comp.b = 255; hc->bgcolor.valid = TRUE; dest = g_byte_array_sized_new (in->len / 3 * 2); styles_blocks = g_queue_new (); p = in->data; c = p; end = p + in->len; while (p < end) { t = *p; switch (state) { case parse_start: if (t == '<') { state = tag_begin; } else { /* We have no starting tag, so assume that it's content */ hc->flags |= RSPAMD_HTML_FLAG_BAD_START; state = content_write; } break; case tag_begin: switch (t) { case '<': p ++; closing = FALSE; break; case '!': state = sgml_tag; p ++; break; case '?': state = xml_tag; hc->flags |= RSPAMD_HTML_FLAG_XML; p ++; break; case '/': closing = TRUE; p ++; break; case '>': /* Empty tag */ hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; state = tag_end; continue; default: state = 
tag_content; substate = 0; savep = NULL; cur_tag = rspamd_mempool_alloc0 (pool, sizeof (*cur_tag)); cur_tag->params = g_queue_new (); rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t)g_queue_free, cur_tag->params); break; } break; case sgml_tag: switch (t) { case '[': state = compound_tag; obrace = 1; ebrace = 0; p ++; break; case '-': state = comment_tag; p ++; break; default: state = sgml_content; break; } break; case xml_tag: if (t == '?') { state = xml_tag_end; } else if (t == '>') { /* Misformed xml tag */ hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; state = tag_end; continue; } /* We efficiently ignore xml tags */ p ++; break; case xml_tag_end: if (t == '>') { state = tag_end; continue; } else { hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; p ++; } break; case compound_tag: if (t == '[') { obrace ++; } else if (t == ']') { ebrace ++; } else if (t == '>' && obrace == ebrace) { state = tag_end; continue; } p ++; break; case comment_tag: if (t != '-') { hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; state = tag_end; } else { p++; ebrace = 0; /* * https://www.w3.org/TR/2012/WD-html5-20120329/syntax.html#syntax-comments * ... the text must not start with a single * U+003E GREATER-THAN SIGN character (>), * nor start with a "-" (U+002D) character followed by * a U+003E GREATER-THAN SIGN (>) character, * nor contain two consecutive U+002D HYPHEN-MINUS * characters (--), nor end with a "-" (U+002D) character. 
*/ if (p[0] == '-' && p + 1 < end && p[1] == '>') { hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; p ++; state = tag_end; } else if (*p == '>') { hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; state = tag_end; } else { state = comment_content; } } break; case comment_content: if (t == '-') { ebrace ++; } else if (t == '>' && ebrace >= 2) { state = tag_end; continue; } else { ebrace = 0; } p ++; break; case content_ignore: if (t != '<') { p ++; } else { state = tag_begin; } break; case content_write: if (t != '<') { if (t == '&') { need_decode = TRUE; } else if (g_ascii_isspace (t)) { save_space = TRUE; if (p > c) { if (need_decode) { goffset old_offset = dest->len; if (content_tag) { if (content_tag->content_length == 0) { content_tag->content_offset = old_offset; } } g_byte_array_append (dest, c, (p - c)); len = rspamd_html_decode_entitles_inplace ( dest->data + old_offset, p - c); dest->len = dest->len + len - (p - c); if (content_tag) { content_tag->content_length += len; } } else { len = p - c; if (content_tag) { if (content_tag->content_length == 0) { content_tag->content_offset = dest->len; } content_tag->content_length += len; } g_byte_array_append (dest, c, len); } } c = p; state = content_ignore_sp; } else { if (save_space) { /* Append one space if needed */ if (dest->len > 0 && !g_ascii_isspace (dest->data[dest->len - 1])) { g_byte_array_append (dest, " ", 1); if (content_tag) { if (content_tag->content_length == 0) { /* * Special case * we have a space at the beginning but * we have no set content_offset * so we need to do it here */ content_tag->content_offset = dest->len; } else { content_tag->content_length++; } } } save_space = FALSE; } } } else { if (c != p) { if (need_decode) { goffset old_offset = dest->len; if (content_tag) { if (content_tag->content_length == 0) { content_tag->content_offset = dest->len; } } g_byte_array_append (dest, c, (p - c)); len = rspamd_html_decode_entitles_inplace ( dest->data + old_offset, p - c); dest->len = dest->len + len 
- (p - c); if (content_tag) { content_tag->content_length += len; } } else { len = p - c; if (content_tag) { if (content_tag->content_length == 0) { content_tag->content_offset = dest->len; } content_tag->content_length += len; } g_byte_array_append (dest, c, len); } } content_tag = NULL; state = tag_begin; continue; } p ++; break; case content_ignore_sp: if (!g_ascii_isspace (t)) { c = p; state = content_write; continue; } p ++; break; case sgml_content: /* TODO: parse DOCTYPE here */ if (t == '>') { state = tag_end; /* We don't know a lot about sgml tags, ignore them */ cur_tag = NULL; continue; } p ++; break; case tag_content: rspamd_html_parse_tag_content (pool, hc, cur_tag, p, &substate, &savep); if (t == '>') { if (closing) { cur_tag->flags |= FL_CLOSING; if (cur_tag->flags & FL_CLOSED) { /* Bad mix of closed and closing */ hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; } closing = FALSE; } state = tag_end; continue; } p ++; break; case tag_end: substate = 0; savep = NULL; if (cur_tag != NULL) { balanced = TRUE; if (rspamd_html_process_tag (pool, hc, cur_tag, &cur_level, &balanced)) { state = content_write; need_decode = FALSE; } else { state = content_ignore; } if (cur_tag->id != -1 && cur_tag->id < N_TAGS) { if (cur_tag->flags & CM_UNIQUE) { if (isset (hc->tags_seen, cur_tag->id)) { /* Duplicate tag has been found */ hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS; } } setbit (hc->tags_seen, cur_tag->id); } if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) { content_tag = cur_tag; } /* Handle newlines */ if (cur_tag->id == Tag_BR || cur_tag->id == Tag_HR) { if (dest->len > 0 && dest->data[dest->len - 1] != '\n') { g_byte_array_append (dest, "\r\n", 2); if (content_tag) { if (content_tag->content_length == 0) { /* * Special case * we have a \r\n at the beginning but * we have no set content_offset * so we need to do it here */ content_tag->content_offset = dest->len; } else { content_tag->content_length += 2; } } } save_space = FALSE; } if ((cur_tag->id == Tag_P 
|| cur_tag->id == Tag_TR || cur_tag->id == Tag_DIV)) { if (dest->len > 0 && dest->data[dest->len - 1] != '\n') { g_byte_array_append (dest, "\r\n", 2); if (content_tag) { if (content_tag->content_length == 0) { /* * Special case * we have a \r\n at the beginning but * we have no set content_offset * so we need to get it here */ content_tag->content_offset = dest->len; } else { content_tag->content_length += 2; } } } save_space = FALSE; } if (cur_tag->flags & FL_HREF) { if (!(cur_tag->flags & (FL_CLOSING))) { url = rspamd_html_process_url_tag (pool, cur_tag, hc); if (url != NULL) { if (url_set != NULL) { if (!rspamd_url_set_add_or_increase (url_set, url)) { rspamd_process_html_url (pool, url, url_set); } } href_offset = dest->len; } } if (cur_tag->id == Tag_A) { if (!balanced && cur_level && cur_level->prev) { struct html_tag *prev_tag; struct rspamd_url *prev_url; prev_tag = cur_level->prev->data; if (prev_tag->id == Tag_A && !(prev_tag->flags & (FL_CLOSING)) && prev_tag->extra) { prev_url = prev_tag->extra; rspamd_html_check_displayed_url (pool, exceptions, url_set, dest, href_offset, prev_url); } } if (cur_tag->flags & (FL_CLOSING)) { /* Insert exception */ if (url != NULL && (gint) dest->len > href_offset) { rspamd_html_check_displayed_url (pool, exceptions, url_set, dest, href_offset, url); } href_offset = -1; url = NULL; } } } else if (cur_tag->id == Tag_BASE && !(cur_tag->flags & (FL_CLOSING))) { /* * Base is allowed only within head tag but HTML is retarded */ if (hc->base_url == NULL) { url = rspamd_html_process_url_tag (pool, cur_tag, hc); if (url != NULL) { msg_debug_html ("got valid base tag"); hc->base_url = url; cur_tag->extra = url; } else { msg_debug_html ("got invalid base tag!"); } } } if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) { rspamd_html_process_img_tag (pool, cur_tag, hc, url_set); } else if (cur_tag->flags & FL_BLOCK) { struct html_block *bl; if (cur_tag->flags & FL_CLOSING) { /* Just remove block element from the queue if 
any */ if (styles_blocks->length > 0) { g_queue_pop_tail (styles_blocks); } } else { rspamd_html_process_block_tag (pool, cur_tag, hc); bl = cur_tag->extra; if (bl) { rspamd_html_propagate_style (hc, cur_tag, cur_tag->extra, styles_blocks); /* Check visibility */ if (bl->font_size < 3 || bl->font_color.d.comp.alpha < 10) { bl->visible = FALSE; msg_debug_html ("tag is not visible: font size: " "%d, alpha: %d", (int)bl->font_size, (int)bl->font_color.d.comp.alpha); } if (!bl->visible) { state = content_ignore; } } } } } else { state = content_write; } p++; c = p; cur_tag = NULL; break; } } if (hc->html_tags) { g_node_traverse (hc->html_tags, G_POST_ORDER, G_TRAVERSE_ALL, -1, rspamd_html_propagate_lengths, NULL); } g_queue_free (styles_blocks); hc->parsed = dest; return dest; } GByteArray* rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc, GByteArray *in) { return rspamd_html_process_part_full (pool, hc, in, NULL, NULL); }