/*
 * Copyright (c) 2009-2012, Vsevolod Stakhov
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include "util.h"
#include "main.h"
#include "message.h"
#include "html.h"
#include "url.h"

/* Set once tag_defs has been sorted by name for bsearch () */
static sig_atomic_t tags_sorted = 0;

/*
 * Known HTML tags. The table is sorted by name (case-insensitively) at
 * runtime before the first lookup, so the order below is not significant.
 */
static struct html_tag tag_defs[] = {
	/* W3C defined elements */
	{Tag_A, "a", (CM_INLINE)},
	{Tag_ABBR, "abbr", (CM_INLINE)},
	{Tag_ACRONYM, "acronym", (CM_INLINE)},
	{Tag_ADDRESS, "address", (CM_BLOCK)},
	{Tag_APPLET, "applet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)},
	{Tag_AREA, "area", (CM_BLOCK | CM_EMPTY)},
	{Tag_B, "b", (CM_INLINE)},
	{Tag_BASE, "base", (CM_HEAD | CM_EMPTY)},
	{Tag_BASEFONT, "basefont", (CM_INLINE | CM_EMPTY)},
	{Tag_BDO, "bdo", (CM_INLINE)},
	{Tag_BIG, "big", (CM_INLINE)},
	{Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)},
	{Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST)},
	{Tag_BR, "br", (CM_INLINE | CM_EMPTY)},
	{Tag_BUTTON, "button", (CM_INLINE)},
	{Tag_CAPTION, "caption", (CM_TABLE)},
	{Tag_CENTER, "center", (CM_BLOCK)},
	{Tag_CITE, "cite", (CM_INLINE)},
	{Tag_CODE, "code", (CM_INLINE)},
	{Tag_COL, "col", (CM_TABLE | CM_EMPTY)},
	{Tag_COLGROUP, "colgroup", (CM_TABLE | CM_OPT)},
	{Tag_DD, "dd", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)},
	{Tag_DEL, "del", (CM_INLINE | CM_BLOCK | CM_MIXED)},
	{Tag_DFN, "dfn", (CM_INLINE)},
	{Tag_DIR, "dir", (CM_BLOCK | CM_OBSOLETE)},
	{Tag_DIV, "div", (CM_BLOCK)},
	{Tag_DL, "dl", (CM_BLOCK)},
	{Tag_DT, "dt", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)},
	{Tag_EM, "em", (CM_INLINE)},
	{Tag_FIELDSET, "fieldset", (CM_BLOCK)},
	{Tag_FONT, "font", (CM_INLINE)},
	{Tag_FORM, "form", (CM_BLOCK)},
	{Tag_FRAME, "frame", (CM_FRAMES | CM_EMPTY)},
	{Tag_FRAMESET, "frameset", (CM_HTML | CM_FRAMES)},
	{Tag_H1, "h1", (CM_BLOCK | CM_HEADING)},
	{Tag_H2, "h2", (CM_BLOCK | CM_HEADING)},
	{Tag_H3, "h3", (CM_BLOCK | CM_HEADING)},
	{Tag_H4, "h4", (CM_BLOCK | CM_HEADING)},
	{Tag_H5, "h5", (CM_BLOCK | CM_HEADING)},
	{Tag_H6, "h6", (CM_BLOCK | CM_HEADING)},
	{Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST)},
	{Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)},
	{Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST)},
	{Tag_I, "i", (CM_INLINE)},
	{Tag_IFRAME, "iframe", (CM_INLINE)},
	{Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)},
	{Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)},
	{Tag_INS, "ins", (CM_INLINE | CM_BLOCK | CM_MIXED)},
	{Tag_ISINDEX, "isindex", (CM_BLOCK | CM_EMPTY)},
	{Tag_KBD, "kbd", (CM_INLINE)},
	{Tag_LABEL, "label", (CM_INLINE)},
	{Tag_LEGEND, "legend", (CM_INLINE)},
	{Tag_LI, "li", (CM_LIST | CM_OPT | CM_NO_INDENT)},
	{Tag_LINK, "link", (CM_HEAD | CM_EMPTY)},
	{Tag_LISTING, "listing", (CM_BLOCK | CM_OBSOLETE)},
	{Tag_MAP, "map", (CM_INLINE)},
	{Tag_MENU, "menu", (CM_BLOCK | CM_OBSOLETE)},
	{Tag_META, "meta", (CM_HEAD | CM_EMPTY)},
	{Tag_NOFRAMES, "noframes", (CM_BLOCK | CM_FRAMES)},
	{Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_MIXED)},
	{Tag_OBJECT, "object", (CM_OBJECT | CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)},
	{Tag_OL, "ol", (CM_BLOCK)},
	{Tag_OPTGROUP, "optgroup", (CM_FIELD | CM_OPT)},
	{Tag_OPTION, "option", (CM_FIELD | CM_OPT)},
	{Tag_P, "p", (CM_BLOCK | CM_OPT)},
	{Tag_PARAM, "param", (CM_INLINE | CM_EMPTY)},
	{Tag_PLAINTEXT, "plaintext", (CM_BLOCK | CM_OBSOLETE)},
	{Tag_PRE, "pre", (CM_BLOCK)},
	{Tag_Q, "q", (CM_INLINE)},
	{Tag_RB, "rb", (CM_INLINE)},
	{Tag_RBC, "rbc", (CM_INLINE)},
	{Tag_RP, "rp", (CM_INLINE)},
	{Tag_RT, "rt", (CM_INLINE)},
	{Tag_RTC, "rtc", (CM_INLINE)},
	{Tag_RUBY, "ruby", (CM_INLINE)},
	{Tag_S, "s", (CM_INLINE)},
	{Tag_SAMP, "samp", (CM_INLINE)},
	{Tag_SCRIPT, "script", (CM_HEAD | CM_MIXED | CM_BLOCK | CM_INLINE)},
	{Tag_SELECT, "select", (CM_INLINE | CM_FIELD)},
	{Tag_SMALL, "small", (CM_INLINE)},
	{Tag_SPAN, "span", (CM_INLINE)},
	{Tag_STRIKE, "strike", (CM_INLINE)},
	{Tag_STRONG, "strong", (CM_INLINE)},
	{Tag_STYLE, "style", (CM_HEAD)},
	{Tag_SUB, "sub", (CM_INLINE)},
	{Tag_SUP, "sup", (CM_INLINE)},
	{Tag_TABLE, "table", (CM_BLOCK)},
	{Tag_TBODY, "tbody", (CM_TABLE | CM_ROWGRP | CM_OPT)},
	{Tag_TD, "td", (CM_ROW | CM_OPT | CM_NO_INDENT)},
	{Tag_TEXTAREA, "textarea", (CM_INLINE | CM_FIELD)},
	{Tag_TFOOT, "tfoot", (CM_TABLE | CM_ROWGRP | CM_OPT)},
	{Tag_TH, "th", (CM_ROW | CM_OPT | CM_NO_INDENT)},
	{Tag_THEAD, "thead", (CM_TABLE | CM_ROWGRP | CM_OPT)},
	{Tag_TITLE, "title", (CM_HEAD)},
	{Tag_TR, "tr", (CM_TABLE | CM_OPT)},
	{Tag_TT, "tt", (CM_INLINE)},
	{Tag_U, "u", (CM_INLINE)},
	{Tag_UL, "ul", (CM_BLOCK)},
	{Tag_VAR, "var", (CM_INLINE)},
	{Tag_XMP, "xmp", (CM_BLOCK | CM_OBSOLETE)},
	{Tag_NEXTID, "nextid", (CM_HEAD | CM_EMPTY)},

	/* proprietary elements */
	{Tag_ALIGN, "align", (CM_BLOCK)},
	{Tag_BGSOUND, "bgsound", (CM_HEAD | CM_EMPTY)},
	{Tag_BLINK, "blink", (CM_INLINE)},
	{Tag_COMMENT, "comment", (CM_INLINE)},
	{Tag_EMBED, "embed", (CM_INLINE | CM_IMG | CM_EMPTY)},
	{Tag_ILAYER, "ilayer", (CM_INLINE)},
	{Tag_KEYGEN, "keygen", (CM_INLINE | CM_EMPTY)},
	{Tag_LAYER, "layer", (CM_BLOCK)},
	{Tag_MARQUEE, "marquee", (CM_INLINE | CM_OPT)},
	{Tag_MULTICOL, "multicol", (CM_BLOCK)},
	{Tag_NOBR, "nobr", (CM_INLINE)},
	{Tag_NOEMBED, "noembed", (CM_INLINE)},
	{Tag_NOLAYER, "nolayer", (CM_BLOCK | CM_INLINE | CM_MIXED)},
	{Tag_NOSAVE, "nosave", (CM_BLOCK)},
	{Tag_SERVER, "server", (CM_HEAD | CM_MIXED | CM_BLOCK | CM_INLINE)},
	{Tag_SERVLET, "servlet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)},
	{Tag_SPACER, "spacer", (CM_INLINE | CM_EMPTY)},
	{Tag_WBR, "wbr", (CM_INLINE | CM_EMPTY)},
};

/* Set once both entity tables have been sorted for bsearch () */
static sig_atomic_t entities_sorted = 0;

struct _entity;
typedef struct _entity entity;

/*
 * One HTML character entity: its name, its numeric code and the ASCII text
 * that is substituted for it (NULL means the entity is dropped from output).
 */
struct _entity {
	const gchar *name;
	guint code;
	const gchar *replacement;
};

/*
 * Entity table, sorted by name at runtime. Lookups by name are
 * case-insensitive, so entries that differ only by case (e.g. "Igrave" and
 * "igrave") are indistinguishable and must carry the same replacement.
 */
static entity entities_defs[] = {
	/*
	** Markup pre-defined character entities
	*/
	{"quot", 34, "\""},
	{"amp", 38, "&"},
	{"apos", 39, "'"},
	{"lt", 60, "<"},
	{"gt", 62, ">"},

	/*
	** Latin-1 character entities
	*/
	{"nbsp", 160, " "},
	{"iexcl", 161, "!"},
	{"cent", 162, "cent"},
	{"pound", 163, "pound"},
	{"curren", 164, "current"},
	{"yen", 165, "yen"},
	{"brvbar", 166, NULL},
	{"sect", 167, NULL},
	{"uml", 168, "uml"},
	{"copy", 169, "c"},
	{"ordf", 170, NULL},
	{"laquo", 171, "\""},
	{"not", 172, "!"},
	{"shy", 173, NULL},
	{"reg", 174, "r"},
	{"macr", 175, NULL},
	{"deg", 176, "deg"},
	{"plusmn", 177, "+-"},
	{"sup2", 178, "2"},
	{"sup3", 179, "3"},
	{"acute", 180, NULL},
	{"micro", 181, NULL},
	{"para", 182, NULL},
	{"middot", 183, "."},
	{"cedil", 184, NULL},
	{"sup1", 185, "1"},
	{"ordm", 186, NULL},
	{"raquo", 187, "\""},
	{"frac14", 188, "1/4"},
	{"frac12", 189, "1/2"},
	{"frac34", 190, "3/4"},
	{"iquest", 191, "i"},
	{"Agrave", 192, "a"},
	{"Aacute", 193, "a"},
	{"Acirc", 194, "a"},
	{"Atilde", 195, "a"},
	{"Auml", 196, "a"},
	{"Aring", 197, "a"},
	{"AElig", 198, "a"},
	{"Ccedil", 199, "c"},
	{"Egrave", 200, "e"},
	{"Eacute", 201, "e"},
	{"Ecirc", 202, "e"},
	{"Euml", 203, "e"},
	{"Igrave", 204, "i"},
	{"Iacute", 205, "i"},
	{"Icirc", 206, "i"},
	{"Iuml", 207, "i"},
	{"ETH", 208, "e"},
	{"Ntilde", 209, "n"},
	{"Ograve", 210, "o"},
	{"Oacute", 211, "o"},
	{"Ocirc", 212, "o"},
	{"Otilde", 213, "o"},
	{"Ouml", 214, "o"},
	{"times", 215, "t"},
	{"Oslash", 216, "o"},
	{"Ugrave", 217, "u"},
	{"Uacute", 218, "u"},
	{"Ucirc", 219, "u"},
	{"Uuml", 220, "u"},
	{"Yacute", 221, "y"},
	{"THORN", 222, "t"},
	{"szlig", 223, "s"},
	{"agrave", 224, "a"},
	{"aacute", 225, "a"},
	{"acirc", 226, "a"},
	{"atilde", 227, "a"},
	{"auml", 228, "a"},
	{"aring", 229, "a"},
	{"aelig", 230, "a"},
	{"ccedil", 231, "c"},
	{"egrave", 232, "e"},
	{"eacute", 233, "e"},
	{"ecirc", 234, "e"},
	{"euml", 235, "e"},
	/* Accented 'i' letters map to "i", same as their upper case twins */
	{"igrave", 236, "i"},
	{"iacute", 237, "i"},
	{"icirc", 238, "i"},
	{"iuml", 239, "i"},
	{"eth", 240, "e"},
	{"ntilde", 241, "n"},
	{"ograve", 242, "o"},
	{"oacute", 243, "o"},
	{"ocirc", 244, "o"},
	{"otilde", 245, "o"},
	{"ouml", 246, "o"},
	{"divide", 247, "/"},
	/* Same replacement as "Oslash": name lookups ignore case */
	{"oslash", 248, "o"},
	{"ugrave", 249, "u"},
	{"uacute", 250, "u"},
	{"ucirc", 251, "u"},
	{"uuml", 252, "u"},
	{"yacute", 253, "y"},
	{"thorn", 254, "t"},
	{"yuml", 255, "y"},

	/*
	** Extended Entities defined in HTML 4: Symbols
	*/
	{"fnof", 402, "f"},
	{"Alpha", 913, "alpha"},
	{"Beta", 914, "beta"},
	{"Gamma", 915, "gamma"},
	{"Delta", 916, "delta"},
	{"Epsilon", 917, "epsilon"},
	{"Zeta", 918, "zeta"},
	{"Eta", 919, "eta"},
	{"Theta", 920, "theta"},
	{"Iota", 921, "iota"},
	{"Kappa", 922, "kappa"},
	{"Lambda", 923, "lambda"},
	{"Mu", 924, "mu"},
	{"Nu", 925, "nu"},
	{"Xi", 926, "xi"},
	{"Omicron", 927, "omicron"},
	{"Pi", 928, "pi"},
	{"Rho", 929, "rho"},
	{"Sigma", 931, "sigma"},
	{"Tau", 932, "tau"},
	{"Upsilon", 933, "upsilon"},
	{"Phi", 934, "phi"},
	{"Chi", 935, "chi"},
	{"Psi", 936, "psi"},
	{"Omega", 937, "omega"},
	{"alpha", 945, "alpha"},
	{"beta", 946, "beta"},
	{"gamma", 947, "gamma"},
	{"delta", 948, "delta"},
	{"epsilon", 949, "epsilon"},
	{"zeta", 950, "zeta"},
	{"eta", 951, "eta"},
	{"theta", 952, "theta"},
	{"iota", 953, "iota"},
	{"kappa", 954, "kappa"},
	{"lambda", 955, "lambda"},
	{"mu", 956, "mu"},
	{"nu", 957, "nu"},
	{"xi", 958, "xi"},
	{"omicron", 959, "omicron"},
	{"pi", 960, "pi"},
	{"rho", 961, "rho"},
	{"sigmaf", 962, "sigmaf"},
	{"sigma", 963, "sigma"},
	{"tau", 964, "tau"},
	{"upsilon", 965, "upsilon"},
	{"phi", 966, "phi"},
	{"chi", 967, "chi"},
	{"psi", 968, "psi"},
	{"omega", 969, "omega"},
	{"thetasym", 977, "thetasym"},
	{"upsih", 978, "upsih"},
	{"piv", 982, "piv"},
	{"bull", 8226, "bull"},
	{"hellip", 8230, "..."},
	{"prime", 8242, "'"},
	{"Prime", 8243, "'"},
	{"oline", 8254, "-"},
	{"frasl", 8260, NULL},
	{"weierp", 8472, NULL},
	{"image", 8465, NULL},
	{"real", 8476, NULL},
	{"trade", 8482, NULL},
	{"alefsym", 8501, "a"},
	{"larr", 8592, NULL},
	{"uarr", 8593, NULL},
	{"rarr", 8594, NULL},
	{"darr", 8595, NULL},
	{"harr", 8596, NULL},
	{"crarr", 8629, NULL},
	{"lArr", 8656, NULL},
	{"uArr", 8657, NULL},
	{"rArr", 8658, NULL},
	{"dArr", 8659, NULL},
	{"hArr", 8660, NULL},
	{"forall", 8704, NULL},
	{"part", 8706, NULL},
	{"exist", 8707, NULL},
	{"empty", 8709, NULL},
	{"nabla", 8711, NULL},
	{"isin", 8712, NULL},
	{"notin", 8713, NULL},
	{"ni", 8715, NULL},
	{"prod", 8719, NULL},
	{"sum", 8721, "E"},
	{"minus", 8722, "-"},
	{"lowast", 8727, NULL},
	{"radic", 8730, NULL},
	{"prop", 8733, NULL},
	{"infin", 8734, NULL},
	{"ang", 8736, "'"},
	{"and", 8743, "&"},
	{"or", 8744, "|"},
	{"cap", 8745, NULL},
	{"cup", 8746, NULL},
	/* Integral sign: the HTML 4 entity name is "int" */
	{"int", 8747, NULL},
	{"there4", 8756, NULL},
	{"sim", 8764, NULL},
	{"cong", 8773, NULL},
	{"asymp", 8776, NULL},
	{"ne", 8800, "!="},
	{"equiv", 8801, "=="},
	{"le", 8804, "<="},
	{"ge", 8805, ">="},
	{"sub", 8834, NULL},
	{"sup", 8835, NULL},
	{"nsub", 8836, NULL},
	{"sube", 8838, NULL},
	{"supe", 8839, NULL},
	{"oplus", 8853, NULL},
	{"otimes", 8855, NULL},
	{"perp", 8869, NULL},
	{"sdot", 8901, NULL},
	{"lceil", 8968, NULL},
	{"rceil", 8969, NULL},
	{"lfloor", 8970, NULL},
	{"rfloor", 8971, NULL},
	{"lang", 9001, NULL},
	{"rang", 9002, NULL},
	{"loz", 9674, NULL},
	{"spades", 9824, NULL},
	{"clubs", 9827, NULL},
	{"hearts", 9829, NULL},
	{"diams", 9830, NULL},

	/*
	** Extended Entities defined in HTML 4: Special (less Markup at top)
	*/
	{"OElig", 338, NULL},
	{"oelig", 339, NULL},
	{"Scaron", 352, NULL},
	{"scaron", 353, NULL},
	{"Yuml", 376, NULL},
	{"circ", 710, NULL},
	{"tilde", 732, NULL},
	{"ensp", 8194, NULL},
	{"emsp", 8195, NULL},
	{"thinsp", 8201, NULL},
	{"zwnj", 8204, NULL},
	{"zwj", 8205, NULL},
	{"lrm", 8206, NULL},
	{"rlm", 8207, NULL},
	{"ndash", 8211, "-"},
	{"mdash", 8212, "-"},
	{"lsquo", 8216, "'"},
	{"rsquo", 8217, "'"},
	{"sbquo", 8218, "\""},
	{"ldquo", 8220, "\""},
	{"rdquo", 8221, "\""},
	{"bdquo", 8222, "\""},
	{"dagger", 8224, "T"},
	{"Dagger", 8225, "T"},
	{"permil", 8240, NULL},
	{"lsaquo", 8249, "\""},
	{"rsaquo", 8250, "\""},
	{"euro", 8364, "E"},
};

/* Copy of entities_defs sorted by numeric code instead of by name */
static entity entities_defs_num[ (G_N_ELEMENTS (entities_defs)) ];

/* Order two struct html_tag elements by name, ignoring ASCII case */
static gint
tag_cmp (const void *m1, const void *m2)
{
	const struct html_tag *p1 = m1;
	const struct html_tag *p2 = m2;

	return g_ascii_strcasecmp (p1->name, p2->name);
}

/* Order two entities by name, ignoring ASCII case */
static gint
entity_cmp (const void *m1, const void *m2)
{
	const entity *p1 = m1;
	const entity *p2 = m2;

	return g_ascii_strcasecmp (p1->name, p2->name);
}

/*
 * Order two entities by numeric code.
 * The codes are unsigned, so they are compared explicitly: a subtraction
 * converted to gint yields the wrong sign when the difference exceeds G_MAXINT.
 */
static gint
entity_cmp_num (const void *m1, const void *m2)
{
	const entity *p1 = m1;
	const entity *p2 = m2;

	return (p1->code > p2->code) - (p1->code < p2->code);
}

/* Sort tag_defs by name on first use so that bsearch () can be applied */
static void
ensure_tags_sorted (void)
{
	if (!tags_sorted) {
		qsort (tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag),
			tag_cmp);
		tags_sorted = 1;
	}
}

/*
 * Sort entities_defs by name and fill entities_defs_num with a copy sorted
 * by code; done on first use so that bsearch () can be applied to both.
 */
static void
ensure_entities_sorted (void)
{
	if (!entities_sorted) {
		qsort (entities_defs, G_N_ELEMENTS (entities_defs), sizeof (entity),
			entity_cmp);
		memcpy (entities_defs_num, entities_defs, sizeof (entities_defs));
		qsort (entities_defs_num, G_N_ELEMENTS (entities_defs),
			sizeof (entity), entity_cmp_num);
		entities_sorted = 1;
	}
}

/*
 * Build a GNode holding a struct html_node for the tag text.
 *
 * @param pool memory pool the struct html_node is allocated from
 * @param text tag text starting right after '<'; it is temporarily
 *             NUL-terminated in place while the tag name is looked up
 * @param tag_len length of the tag text
 * @return new node, or NULL if text is empty or the tag name is unknown
 */
static GNode *
construct_html_node (rspamd_mempool_t * pool, gchar *text, gsize tag_len)
{
	struct html_node *html;
	struct html_tag key, *found;
	gchar saved;

	if (text == NULL || *text == '\0') {
		return NULL;
	}

	ensure_tags_sorted ();

	html = rspamd_mempool_alloc0 (pool, sizeof (struct html_node));

	/* Check whether this tag is fully closed; never look before the text */
	if (tag_len > 0 && text[tag_len - 1] == '/') {
		html->flags |= FL_CLOSED;
	}

	/* Check xml tag */
	if (*text == '?' &&
		g_ascii_strncasecmp (text + 1, "xml", sizeof ("xml") - 1) == 0) {
		html->flags |= FL_XML;
		html->tag = NULL;
	}
	else if (*text == '!') {
		html->flags |= FL_SGML;
		html->tag = NULL;
	}
	else {
		if (*text == '/') {
			html->flags |= FL_CLOSING;
			text++;
		}

		/* Find end of tag name */
		key.name = text;
		while (*text && g_ascii_isalnum (*(++text))) ;

		/* Terminate the name for the comparison, then restore the byte */
		saved = *text;
		*text = '\0';
		found = bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs),
				sizeof (struct html_tag), tag_cmp);
		*text = saved;

		if (found == NULL) {
			return NULL;
		}
		html->tag = found;
	}

	return g_node_new (html);
}

/*
 * Match a closing tag against its open ancestors.
 *
 * For a closing tag the ancestors of node are walked upwards (stopping at a
 * node without data); the first one with the same tag id that is not yet
 * closed is marked FL_CLOSED, node itself is destroyed and *cur_level is set
 * to the parent of the matched ancestor.
 *
 * @return TRUE for non-closing tags and for matched closing tags (in which
 *         case node must not be used any more), FALSE if no ancestor matched
 */
static gboolean
check_balance (GNode * node, GNode ** cur_level)
{
	struct html_node *arg = node->data, *tmp;
	GNode *cur;

	if ((arg->flags & FL_CLOSING) == 0) {
		return TRUE;
	}

	/* First of all check whether this tag is closing tag for parent node */
	for (cur = node->parent; cur != NULL && cur->data != NULL;
		cur = cur->parent) {
		tmp = cur->data;
		if ((tmp->tag && arg->tag) && tmp->tag->id == arg->tag->id &&
			(tmp->flags & FL_CLOSED) == 0) {
			tmp->flags |= FL_CLOSED;
			/* Destroy current node as we find corresponding parent node */
			g_node_destroy (node);
			/* Change level */
			*cur_level = cur->parent;
			return TRUE;
		}
	}

	return FALSE;
}

/*
 * Look up a tag definition by name, ignoring ASCII case.
 *
 * @param name NUL-terminated tag name
 * @return matching entry of the tag table or NULL if the name is unknown
 */
struct html_tag *
get_tag_by_name (const gchar *name)
{
	struct html_tag key;

	if (name == NULL) {
		return NULL;
	}

	/* May be called before add_html_node (), so sort the table here too */
	ensure_tags_sorted ();

	key.name = name;

	return bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs),
			   sizeof (struct html_tag), tag_cmp);
}

/*
 * Decode HTML entities in text, in place.
 *
 * Named entities ("&amp;") and numeric references ("&#38;", "&#x26;", and
 * the "&#o46;" octal form) found in the entity table are replaced with their
 * ASCII replacement; table entries without a replacement are removed.
 * Numeric references to printable ASCII characters that are not in the table
 * are replaced with that character; other unknown numeric references are
 * removed. Anything that is not a well-formed, known entity (unknown names,
 * malformed numbers, '&' without ';') is copied to the output unchanged.
 * The output is never longer than the input: a replacement that would not
 * fit into the space consumed so far leaves the entity text as is.
 *
 * @param s text to decode; a terminating NUL is written after the decoded
 *          text, so the buffer must have room for the input length plus one
 * @param len if not NULL and not zero, the input length; otherwise strlen (s)
 *            is used. If not NULL, receives the decoded length.
 */
void
decode_entitles (gchar *s, guint * len)
{
	guint l, rep_len;
	gchar *t, *h, *e, *end;		/* t - tortoise, h - hare, e - last '&' */
	gchar *digits, *end_ptr;
	const gchar *rep;
	gchar ascii;
	gint state = 0, base;
	gulong val;
	gboolean keep;
	entity *found, key;

	if (s == NULL) {
		return;
	}

	/* May be called before add_html_node (), so sort the tables here too */
	ensure_entities_sorted ();

	if (len == NULL || *len == 0) {
		l = strlen (s);
	}
	else {
		l = *len;
	}

	t = s;
	h = s;
	e = s;
	end = s + l;

	while (h < end) {
		if (state == 0) {
			/* Out of entitle */
			if (*h == '&') {
				state = 1;
				e = h;
			}
			else {
				*t++ = *h;
			}
			h++;
			continue;
		}

		/* Inside entitle: e points to '&', t <= e holds here */
		if (*h == '&') {
			/* Previous '&' did not start an entity, keep its text */
			memmove (t, e, h - e);
			t += h - e;
			e = h;
			h++;
			continue;
		}
		if (*h != ';') {
			h++;
			continue;
		}

		/* Terminate the entity text for bsearch () and strtoul () */
		*h = '\0';
		rep = NULL;
		rep_len = 0;
		keep = FALSE;

		if (*(e + 1) != '#') {
			/* Named entity */
			key.name = e + 1;
			found = bsearch (&key, entities_defs, G_N_ELEMENTS (entities_defs),
					sizeof (entity), entity_cmp);
			if (found == NULL) {
				keep = TRUE;
			}
			else if (found->replacement != NULL) {
				rep = found->replacement;
				rep_len = strlen (rep);
			}
		}
		else {
			/* Numeric entity: e[1] is '#', so e + 2 is not beyond h */
			digits = e + 2;
			if (*digits == 'x' || *digits == 'X') {
				base = 16;
				digits++;
			}
			else if (*digits == 'o' || *digits == 'O') {
				base = 8;
				digits++;
			}
			else {
				base = 10;
			}

			/* Reject empty numbers, signs and spaces accepted by strtoul */
			if (!g_ascii_isxdigit (*digits)) {
				keep = TRUE;
			}
			else {
				val = strtoul (digits, &end_ptr, base);
				if (end_ptr == digits || *end_ptr != '\0') {
					/* Not a number in this base */
					keep = TRUE;
				}
				else if (val <= G_MAXUINT) {
					/* Search for a replacement */
					key.code = (guint)val;
					found = bsearch (&key, entities_defs_num,
							G_N_ELEMENTS (entities_defs), sizeof (entity),
							entity_cmp_num);
					if (found != NULL) {
						if (found->replacement != NULL) {
							rep = found->replacement;
							rep_len = strlen (rep);
						}
					}
					else if (val >= 0x20 && val < 0x7f) {
						/* Plain printable ASCII character */
						ascii = (gchar)val;
						rep = &ascii;
						rep_len = 1;
					}
				}
			}
		}

		*h = ';';

		/* The output must never overtake the input */
		if (!keep && rep != NULL && rep_len > (gsize)(h - t) + 1) {
			keep = TRUE;
		}

		if (keep) {
			/* Copy undecoded entity as is, including ';' */
			memmove (t, e, h - e + 1);
			t += h - e + 1;
		}
		else if (rep != NULL) {
			memcpy (t, rep, rep_len);
			t += rep_len;
		}

		state = 0;
		h++;
	}

	if (state == 1) {
		/* Unfinished entity at the end of text, copy as is */
		memmove (t, e, end - e);
		t += end - e;
	}

	*t = '\0';

	if (len != NULL) {
		*len = t - s;
	}
}

/* Compare two host names given with explicit lengths, ignoring ASCII case */
static gboolean
html_hosts_equal (const gchar *h1, gsize len1, const gchar *h2, gsize len2)
{
	return len1 == len2 && g_ascii_strncasecmp (h1, h2, len1) == 0;
}

/* Check for the "www." prefix; host must be at least 4 characters long */
static gboolean
html_host_has_www (const gchar *host)
{
	return (host[0] == 'w' || host[0] == 'W') &&
		   (host[1] == 'w' || host[1] == 'W') &&
		   (host[2] == 'w' || host[2] == 'W') &&
		   (host[3] == '.');
}

/*
 * Compare the url of a tag with a url found in the text that follows it.
 *
 * The text is scanned up to the next tag with the same id; IMG tags met on
 * the way are skipped and the text is restarted after them. If the scanned
 * text contains a url whose host differs from the host of href_url (a
 * leading "www." on either side is tolerated), href_url is marked as phished
 * and the url found in the text is stored in href_url->phished_url.
 *
 * @param task current task, its pool is used for allocations
 * @param href_url url extracted from the tag attribute
 * @param url_text text following the tag
 * @param remain number of bytes available at url_text
 * @param id id of the tag that terminates the scanned text
 */
static void
check_phishing (struct rspamd_task *task,
	struct uri *href_url,
	const gchar *url_text,
	gsize remain,
	tag_id_t id)
{
	struct uri *new;
	gchar *url_str;
	const gchar *p, *c;
	gchar tagbuf[128];
	struct html_tag *tag;
	gsize len = 0, tag_start;
	gint rc;
	gboolean phished;

	p = url_text;
	while (len < remain) {
		if (*p == '<') {
			tag_start = len;
			/* Check tag name */
			c = p + 1;
			if (len + 1 < remain && *c == '/') {
				c++;
			}
			while (len < remain && !g_ascii_isspace (*p) && *p != '>') {
				p++;
				len++;
			}
			/* The name may be empty, e.g. for "</" at the end of text */
			tagbuf[0] = '\0';
			if (p > c) {
				rspamd_strlcpy (tagbuf, c,
					MIN (sizeof (tagbuf), (gsize)(p - c) + 1));
			}
			if ((tag = get_tag_by_name (tagbuf)) != NULL) {
				if (tag->id == id) {
					/* Text of interest ends where this tag begins */
					len = tag_start;
					break;
				}
				else if (tag->id == Tag_IMG) {
					/* We should ignore IMG tag here */
					while (len < remain && *p != '>' && *p != '<') {
						p++;
						len++;
					}
					if (len < remain && *p == '>') {
						p++;
					}

					remain -= p - url_text;
					url_text = p;
					len = 0;
					continue;
				}
			}
			if (len >= remain) {
				/* Tag name runs up to the end of text */
				break;
			}
		}
		len++;
		p++;
	}

	if (len == 0) {
		return;
	}

	if (!url_try_text (task->task_pool, url_text, len, NULL, NULL, &url_str,
		TRUE) || url_str == NULL) {
		return;
	}

	new = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct uri));
	if (new == NULL) {
		return;
	}

	g_strstrip (url_str);
	rc = parse_uri (new, url_str, task->task_pool);

	if (rc != URI_ERRNO_OK && rc != URI_ERRNO_NO_SLASHES &&
		rc != URI_ERRNO_NO_HOST_SLASH) {
		msg_info ("extract of url '%s' failed: %s",
			url_str,
			url_strerror (rc));
		return;
	}

	if (href_url->host == NULL || new->host == NULL) {
		/* Nothing to compare */
		return;
	}

	if (html_hosts_equal (href_url->host, href_url->hostlen,
		new->host, new->hostlen)) {
		return;
	}

	/* Special check for urls beginning with 'www' */
	phished = TRUE;
	if (new->hostlen > 4 && href_url->hostlen > 4) {
		if (html_host_has_www (new->host)) {
			phished = !html_hosts_equal (new->host + 4, new->hostlen - 4,
					href_url->host, href_url->hostlen);
		}
		else if (html_host_has_www (href_url->host)) {
			phished = !html_hosts_equal (href_url->host + 4,
					href_url->hostlen - 4, new->host, new->hostlen);
		}
	}

	if (phished) {
		href_url->is_phished = TRUE;
		href_url->phished_url = new;
	}
}

/*
 * Extract the url from the href attribute of an A tag or the src attribute
 * of an IMG tag, add it to task->urls if it is not there yet and, for A
 * tags, compare it with the url displayed in the text after the tag.
 *
 * Only values starting with "http", "www", "ftp://" or "mailto:" are
 * considered. The value is url-unescaped and HTML entities are decoded
 * before parsing.
 *
 * @param task current task
 * @param part text part being processed (not used here)
 * @param id id of the tag: Tag_A or Tag_IMG, other ids are ignored
 * @param tag_text text of the tag
 * @param tag_len length of the tag text
 * @param remain number of bytes available starting from tag_text
 */
static void
parse_tag_url (struct rspamd_task *task,
	struct mime_text_part *part,
	tag_id_t id,
	gchar *tag_text,
	gsize tag_len,
	gsize remain)
{
	gchar *c = NULL, *p, *url_text;
	gint len = 0, rc;
	struct uri *url;
	gboolean got_single_quote = FALSE, got_double_quote = FALSE;

	/* For A tags search for href= and for IMG tags search for src= */
	if (id == Tag_A) {
		c = rspamd_strncasestr (tag_text, "href=", tag_len);
		len = sizeof ("href=") - 1;
	}
	else if (id == Tag_IMG) {
		c = rspamd_strncasestr (tag_text, "src=", tag_len);
		len = sizeof ("src=") - 1;
	}

	if (c == NULL) {
		return;
	}

	/* First calculate length */
	c += len;
	/* Skip spaces after eqsign, staying inside the tag */
	while ((gsize)(c - tag_text) < tag_len && g_ascii_isspace (*c)) {
		c++;
	}

	len = 0;
	p = c;
	while (*p && (gsize)(p - tag_text) < tag_len) {
		if (got_double_quote) {
			if (*p == '"') {
				break;
			}
			len++;
		}
		else if (got_single_quote) {
			if (*p == '\'') {
				break;
			}
			len++;
		}
		else if (g_ascii_isspace (*p) || *p == '>' ||
			(*p == '/' && *(p + 1) == '>') || *p == '\r' || *p == '\n') {
			break;
		}
		else if (p == c && *p == '"') {
			/* A quote opens the value only as its very first character */
			got_double_quote = TRUE;
		}
		else if (p == c && *p == '\'') {
			got_single_quote = TRUE;
		}
		else {
			len++;
		}
		p++;
	}

	if (got_single_quote || got_double_quote) {
		/* Skip the opening quote */
		c++;
	}

	if (len == 0) {
		return;
	}

	url_text = rspamd_mempool_alloc (task->task_pool, len + 1);
	rspamd_strlcpy (url_text, c, len + 1);
	rspamd_url_unescape (url_text);
	decode_entitles (url_text, NULL);

	if (g_ascii_strncasecmp (url_text, "http", sizeof ("http") - 1) != 0 &&
		g_ascii_strncasecmp (url_text, "www", sizeof ("www") - 1) != 0 &&
		g_ascii_strncasecmp (url_text, "ftp://", sizeof ("ftp://") - 1) != 0 &&
		g_ascii_strncasecmp (url_text, "mailto:",
			sizeof ("mailto:") - 1) != 0) {
		return;
	}

	/* Zeroed, as hostlen is examined below whatever parse_uri () has done */
	url = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct uri));
	rc = parse_uri (url, url_text, task->task_pool);

	if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST &&
		url->hostlen != 0) {
		/*
		 * Check for phishing
		 */
		if (id == Tag_A && (p = strchr (c, '>')) != NULL) {
			p++;
			/* Do not let the remaining length wrap around */
			if ((gsize)(p - tag_text) < remain) {
				check_phishing (task, url, p, remain - (p - tag_text), id);
			}
		}
		if (g_tree_lookup (task->urls, url) == NULL) {
			g_tree_insert (task->urls, url, url);
		}
	}
}

/*
 * Add a tag to the tree of HTML nodes of a text part.
 *
 * On the first call for a part the root node is created, stored in
 * part->html_nodes and registered for destruction with the pool. Urls are
 * extracted from opening A and IMG tags. Closing tags are matched against
 * their open ancestors; if no ancestor matches, the part is marked as
 * unbalanced. XML and SGML nodes are not stored in the tree.
 *
 * @param task current task
 * @param pool memory pool for node data
 * @param part text part that owns the tree
 * @param tag_text text of the tag
 * @param tag_len length of the tag text
 * @param remain number of bytes available starting from tag_text
 * @param cur_level in/out: node that new tags are appended to
 * @return FALSE if the node cannot be constructed, if there is no parent to
 *         attach it to, or if the tag is STYLE, SCRIPT, OBJECT or TITLE;
 *         TRUE otherwise
 */
gboolean
add_html_node (struct rspamd_task *task,
	rspamd_mempool_t * pool,
	struct mime_text_part *part,
	gchar *tag_text,
	gsize tag_len,
	gsize remain,
	GNode ** cur_level)
{
	GNode *new;
	struct html_node *data;

	ensure_tags_sorted ();
	ensure_entities_sorted ();

	/* First call of this function */
	if (part->html_nodes == NULL) {
		/* Insert root node */
		new = g_node_new (NULL);
		*cur_level = new;
		part->html_nodes = new;
		rspamd_mempool_add_destructor (pool,
			(rspamd_mempool_destruct_t) g_node_destroy,
			part->html_nodes);
		/* Go on with the tag itself, now that the root node exists */
	}

	new = construct_html_node (pool, tag_text, tag_len);
	if (new == NULL) {
		debug_task ("cannot construct HTML node for text '%*s'",
			tag_len,
			tag_text);
		return FALSE;
	}

	data = new->data;
	if (data->tag &&
		(data->tag->id == Tag_A ||
		data->tag->id == Tag_IMG) && ((data->flags & FL_CLOSING) == 0)) {
		parse_tag_url (task, part, data->tag->id, tag_text, tag_len, remain);
	}

	if (data->flags & FL_CLOSING) {
		if (!*cur_level) {
			debug_task ("bad parent node");
			/* The node has not been attached to the tree, free it here */
			g_node_destroy (new);
			return FALSE;
		}
		g_node_append (*cur_level, new);
		/* On success check_balance () destroys the node, do not use it */
		if (!check_balance (new, cur_level)) {
			debug_task (
				"mark part as unbalanced as it has not pairable closing tags");
			part->is_balanced = FALSE;
		}
	}
	else if ((data->flags & (FL_XML|FL_SGML)) == 0) {
		if (!*cur_level) {
			debug_task ("bad parent node");
			g_node_destroy (new);
			return FALSE;
		}
		g_node_append (*cur_level, new);
		if ((data->flags & FL_CLOSED) == 0) {
			*cur_level = new;
		}
		/* Skip some tags */
		if (data->tag && (data->tag->id == Tag_STYLE ||
			data->tag->id == Tag_SCRIPT ||
			data->tag->id == Tag_OBJECT ||
			data->tag->id == Tag_TITLE)) {
			return FALSE;
		}
	}
	else {
		/* Destroy ignored nodes */
		g_node_destroy (new);
	}

	return TRUE;
}

/*
 * vi:ts=4
 */