diff options
Diffstat (limited to 'src/libserver/html.c')
-rw-r--r-- | src/libserver/html.c | 942 |
1 files changed, 942 insertions, 0 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c new file mode 100644 index 000000000..028c54f6c --- /dev/null +++ b/src/libserver/html.c @@ -0,0 +1,942 @@ +/* + * Copyright (c) 2009-2012, Vsevolod Stakhov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "util.h" +#include "main.h" +#include "message.h" +#include "html.h" +#include "url.h" + +static sig_atomic_t tags_sorted = 0; + +static struct html_tag tag_defs[] = { + /* W3C defined elements */ + {Tag_A, "a", (CM_INLINE)}, + {Tag_ABBR, "abbr", (CM_INLINE)}, + {Tag_ACRONYM, "acronym", (CM_INLINE)}, + {Tag_ADDRESS, "address", (CM_BLOCK)}, + {Tag_APPLET, "applet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)}, + {Tag_AREA, "area", (CM_BLOCK | CM_EMPTY)}, + {Tag_B, "b", (CM_INLINE)}, + {Tag_BASE, "base", (CM_HEAD | CM_EMPTY)}, + {Tag_BASEFONT, "basefont", (CM_INLINE | CM_EMPTY)}, + {Tag_BDO, "bdo", (CM_INLINE)}, + {Tag_BIG, "big", (CM_INLINE)}, + {Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)}, + {Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST)}, + {Tag_BR, "br", (CM_INLINE | CM_EMPTY)}, + {Tag_BUTTON, "button", (CM_INLINE)}, + {Tag_CAPTION, "caption", (CM_TABLE)}, + {Tag_CENTER, "center", (CM_BLOCK)}, + {Tag_CITE, "cite", (CM_INLINE)}, + {Tag_CODE, "code", (CM_INLINE)}, + {Tag_COL, "col", (CM_TABLE | CM_EMPTY)}, + {Tag_COLGROUP, "colgroup", (CM_TABLE | CM_OPT)}, + {Tag_DD, "dd", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)}, + {Tag_DEL, "del", (CM_INLINE | CM_BLOCK | CM_MIXED)}, + {Tag_DFN, "dfn", (CM_INLINE)}, + {Tag_DIR, "dir", (CM_BLOCK | CM_OBSOLETE)}, + {Tag_DIV, "div", (CM_BLOCK)}, + {Tag_DL, "dl", (CM_BLOCK)}, + {Tag_DT, "dt", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)}, + {Tag_EM, "em", (CM_INLINE)}, + {Tag_FIELDSET, "fieldset", (CM_BLOCK)}, + {Tag_FONT, "font", (CM_INLINE)}, + {Tag_FORM, "form", (CM_BLOCK)}, + {Tag_FRAME, "frame", (CM_FRAMES | CM_EMPTY)}, + {Tag_FRAMESET, "frameset", (CM_HTML | CM_FRAMES)}, + {Tag_H1, "h1", (CM_BLOCK | CM_HEADING)}, + {Tag_H2, "h2", (CM_BLOCK | CM_HEADING)}, + {Tag_H3, "h3", (CM_BLOCK | CM_HEADING)}, + {Tag_H4, "h4", (CM_BLOCK | CM_HEADING)}, + {Tag_H5, "h5", (CM_BLOCK | CM_HEADING)}, + {Tag_H6, "h6", (CM_BLOCK | CM_HEADING)}, + {Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST)}, + {Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)}, + {Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST)}, + {Tag_I, "i", (CM_INLINE)}, + {Tag_IFRAME, "iframe", (CM_INLINE)}, + {Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)}, + {Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)}, + {Tag_INS, "ins", (CM_INLINE | CM_BLOCK | CM_MIXED)}, + {Tag_ISINDEX, "isindex", (CM_BLOCK | CM_EMPTY)}, + {Tag_KBD, "kbd", (CM_INLINE)}, + {Tag_LABEL, "label", (CM_INLINE)}, + {Tag_LEGEND, "legend", (CM_INLINE)}, + {Tag_LI, "li", (CM_LIST | CM_OPT | CM_NO_INDENT)}, + {Tag_LINK, "link", (CM_HEAD | CM_EMPTY)}, + {Tag_LISTING, "listing", (CM_BLOCK | CM_OBSOLETE)}, + {Tag_MAP, "map", (CM_INLINE)}, + {Tag_MENU, "menu", (CM_BLOCK | CM_OBSOLETE)}, + {Tag_META, "meta", (CM_HEAD | CM_EMPTY)}, + {Tag_NOFRAMES, "noframes", (CM_BLOCK | CM_FRAMES)}, + {Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_MIXED)}, + {Tag_OBJECT, "object", (CM_OBJECT | CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)}, + {Tag_OL, "ol", (CM_BLOCK)}, + {Tag_OPTGROUP, "optgroup", (CM_FIELD | CM_OPT)}, + {Tag_OPTION, "option", (CM_FIELD | CM_OPT)}, + {Tag_P, "p", (CM_BLOCK | CM_OPT)}, + {Tag_PARAM, "param", (CM_INLINE | CM_EMPTY)}, + {Tag_PLAINTEXT, "plaintext", (CM_BLOCK | CM_OBSOLETE)}, + {Tag_PRE, "pre", (CM_BLOCK)}, + {Tag_Q, "q", (CM_INLINE)}, + {Tag_RB, "rb", (CM_INLINE)}, + {Tag_RBC, "rbc", (CM_INLINE)}, + {Tag_RP, "rp", (CM_INLINE)}, + {Tag_RT, "rt", (CM_INLINE)}, + {Tag_RTC, "rtc", (CM_INLINE)}, + {Tag_RUBY, "ruby", (CM_INLINE)}, + {Tag_S, "s", (CM_INLINE)}, + {Tag_SAMP, "samp", (CM_INLINE)}, + {Tag_SCRIPT, "script", (CM_HEAD | CM_MIXED | CM_BLOCK | CM_INLINE)}, + {Tag_SELECT, "select", (CM_INLINE | CM_FIELD)}, + {Tag_SMALL, "small", (CM_INLINE)}, + {Tag_SPAN, "span", (CM_INLINE)}, + {Tag_STRIKE, "strike", (CM_INLINE)}, + {Tag_STRONG, "strong", (CM_INLINE)}, + {Tag_STYLE, "style", (CM_HEAD)}, + {Tag_SUB, "sub", (CM_INLINE)}, + {Tag_SUP, "sup", (CM_INLINE)}, + {Tag_TABLE, "table", (CM_BLOCK)}, + {Tag_TBODY, "tbody", (CM_TABLE | CM_ROWGRP | CM_OPT)}, + {Tag_TD, "td", (CM_ROW | CM_OPT | CM_NO_INDENT)}, + {Tag_TEXTAREA, "textarea", (CM_INLINE | CM_FIELD)}, + {Tag_TFOOT, "tfoot", (CM_TABLE | CM_ROWGRP | CM_OPT)}, + {Tag_TH, "th", (CM_ROW | CM_OPT | CM_NO_INDENT)}, + {Tag_THEAD, "thead", (CM_TABLE | CM_ROWGRP | CM_OPT)}, + {Tag_TITLE, "title", (CM_HEAD)}, + {Tag_TR, "tr", (CM_TABLE | CM_OPT)}, + {Tag_TT, "tt", (CM_INLINE)}, + {Tag_U, "u", (CM_INLINE)}, + {Tag_UL, "ul", (CM_BLOCK)}, + {Tag_VAR, "var", (CM_INLINE)}, + {Tag_XMP, "xmp", (CM_BLOCK | CM_OBSOLETE)}, + {Tag_NEXTID, "nextid", (CM_HEAD | CM_EMPTY)}, + + /* proprietary elements */ + {Tag_ALIGN, "align", (CM_BLOCK)}, + {Tag_BGSOUND, "bgsound", (CM_HEAD | CM_EMPTY)}, + {Tag_BLINK, "blink", (CM_INLINE)}, + {Tag_COMMENT, "comment", (CM_INLINE)}, + {Tag_EMBED, "embed", (CM_INLINE | CM_IMG | CM_EMPTY)}, + {Tag_ILAYER, "ilayer", (CM_INLINE)}, + {Tag_KEYGEN, "keygen", (CM_INLINE | CM_EMPTY)}, + {Tag_LAYER, "layer", (CM_BLOCK)}, + {Tag_MARQUEE, "marquee", (CM_INLINE | CM_OPT)}, + {Tag_MULTICOL, "multicol", (CM_BLOCK)}, + {Tag_NOBR, "nobr", (CM_INLINE)}, + {Tag_NOEMBED, "noembed", (CM_INLINE)}, + {Tag_NOLAYER, "nolayer", (CM_BLOCK | CM_INLINE | CM_MIXED)}, + {Tag_NOSAVE, "nosave", (CM_BLOCK)}, + {Tag_SERVER, "server", (CM_HEAD | CM_MIXED | CM_BLOCK | CM_INLINE)}, + {Tag_SERVLET, "servlet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)}, + {Tag_SPACER, "spacer", (CM_INLINE | CM_EMPTY)}, + {Tag_WBR, "wbr", (CM_INLINE | CM_EMPTY)}, +}; + +static sig_atomic_t entities_sorted = 0; +struct _entity; +typedef struct _entity entity; + +struct _entity { + gchar *name; + uint code; + gchar *replacement; +}; + + +static entity entities_defs[] = { + /* + ** Markup pre-defined character entities + */ + {"quot", 34, "\""}, + {"amp", 38, "&"}, + {"apos", 39, "'"}, + {"lt", 60, "<"}, + {"gt", 62, ">"}, + + /* + ** Latin-1 character entities + */ + {"nbsp", 160, " "}, + {"iexcl", 161, "!"}, + {"cent", 162, "cent"}, + {"pound", 163, "pound"}, + {"curren", 164, "current"}, + {"yen", 165, "yen"}, + {"brvbar", 166, NULL}, + {"sect", 167, NULL}, + {"uml", 168, "uml"}, + {"copy", 169, "c"}, + {"ordf", 170, NULL}, + {"laquo", 171, "\""}, + {"not", 172, "!"}, + {"shy", 173, NULL}, + {"reg", 174, "r"}, + {"macr", 175, NULL}, + {"deg", 176, "deg"}, + {"plusmn", 177, "+-"}, + {"sup2", 178, "2"}, + {"sup3", 179, "3"}, + {"acute", 180, NULL}, + {"micro", 181, NULL}, + {"para", 182, NULL}, + {"middot", 183, "."}, + {"cedil", 184, NULL}, + {"sup1", 185, "1"}, + {"ordm", 186, NULL}, + {"raquo", 187, "\""}, + {"frac14", 188, "1/4"}, + {"frac12", 189, "1/2"}, + {"frac34", 190, "3/4"}, + {"iquest", 191, "i"}, + {"Agrave", 192, "a"}, + {"Aacute", 193, "a"}, + {"Acirc", 194, "a"}, + {"Atilde", 195, "a"}, + {"Auml", 196, "a"}, + {"Aring", 197, "a"}, + {"AElig", 198, "a"}, + {"Ccedil", 199, "c"}, + {"Egrave", 200, "e"}, + {"Eacute", 201, "e"}, + {"Ecirc", 202, "e"}, + {"Euml", 203, "e"}, + {"Igrave", 204, "i"}, + {"Iacute", 205, "i"}, + {"Icirc", 206, "i"}, + {"Iuml", 207, "i"}, + {"ETH", 208, "e"}, + {"Ntilde", 209, "n"}, + {"Ograve", 210, "o"}, + {"Oacute", 211, "o"}, + {"Ocirc", 212, "o"}, + {"Otilde", 213, "o"}, + {"Ouml", 214, "o"}, + {"times", 215, "t"}, + {"Oslash", 216, "o"}, + {"Ugrave", 217, "u"}, + {"Uacute", 218, "u"}, + {"Ucirc", 219, "u"}, + {"Uuml", 220, "u"}, + {"Yacute", 221, "y"}, + {"THORN", 222, "t"}, + {"szlig", 223, "s"}, + {"agrave", 224, "a"}, + {"aacute", 225, "a"}, + {"acirc", 226, "a"}, + {"atilde", 227, "a"}, + {"auml", 228, "a"}, + {"aring", 229, "a"}, + {"aelig", 230, "a"}, + {"ccedil", 231, "c"}, + {"egrave", 232, "e"}, + {"eacute", 233, "e"}, + {"ecirc", 234, "e"}, + {"euml", 235, "e"}, + {"igrave", 236, "e"}, + {"iacute", 237, "e"}, + {"icirc", 238, "e"}, + {"iuml", 239, "e"}, + {"eth", 240, "e"}, + {"ntilde", 241, "n"}, + {"ograve", 242, "o"}, + {"oacute", 243, "o"}, + {"ocirc", 244, "o"}, + {"otilde", 245, "o"}, + {"ouml", 246, "o"}, + {"divide", 247, "/"}, + {"oslash", 248, "/"}, + {"ugrave", 249, "u"}, + {"uacute", 250, "u"}, + {"ucirc", 251, "u"}, + {"uuml", 252, "u"}, + {"yacute", 253, "y"}, + {"thorn", 254, "t"}, + {"yuml", 255, "y"}, + + /* + ** Extended Entities defined in HTML 4: Symbols + */ + {"fnof", 402, "f"}, + {"Alpha", 913, "alpha"}, + {"Beta", 914, "beta"}, + {"Gamma", 915, "gamma"}, + {"Delta", 916, "delta"}, + {"Epsilon", 917, "epsilon"}, + {"Zeta", 918, "zeta"}, + {"Eta", 919, "eta"}, + {"Theta", 920, "theta"}, + {"Iota", 921, "iota"}, + {"Kappa", 922, "kappa"}, + {"Lambda", 923, "lambda"}, + {"Mu", 924, "mu"}, + {"Nu", 925, "nu"}, + {"Xi", 926, "xi"}, + {"Omicron", 927, "omicron"}, + {"Pi", 928, "pi"}, + {"Rho", 929, "rho"}, + {"Sigma", 931, "sigma"}, + {"Tau", 932, "tau"}, + {"Upsilon", 933, "upsilon"}, + {"Phi", 934, "phi"}, + {"Chi", 935, "chi"}, + {"Psi", 936, "psi"}, + {"Omega", 937, "omega"}, + {"alpha", 945, "alpha"}, + {"beta", 946, "beta"}, + {"gamma", 947, "gamma"}, + {"delta", 948, "delta"}, + {"epsilon", 949, "epsilon"}, + {"zeta", 950, "zeta"}, + {"eta", 951, "eta"}, + {"theta", 952, "theta"}, + {"iota", 953, "iota"}, + {"kappa", 954, "kappa"}, + {"lambda", 955, "lambda"}, + {"mu", 956, "mu"}, + {"nu", 957, "nu"}, + {"xi", 958, "xi"}, + {"omicron", 959, "omicron"}, + {"pi", 960, "pi"}, + {"rho", 961, "rho"}, + {"sigmaf", 962, "sigmaf"}, + {"sigma", 963, "sigma"}, + {"tau", 964, "tau"}, + {"upsilon", 965, "upsilon"}, + {"phi", 966, "phi"}, + {"chi", 967, "chi"}, + {"psi", 968, "psi"}, + {"omega", 969, "omega"}, + {"thetasym", 977, "thetasym"}, + {"upsih", 978, "upsih"}, + {"piv", 982, "piv"}, + {"bull", 8226, "bull"}, + {"hellip", 8230, "..."}, + {"prime", 8242, "'"}, + {"Prime", 8243, "'"}, + {"oline", 8254, "-"}, + {"frasl", 8260, NULL}, + {"weierp", 8472, NULL}, + {"image", 8465, NULL}, + {"real", 8476, NULL}, + {"trade", 8482, NULL}, + {"alefsym", 8501, "a"}, + {"larr", 8592, NULL}, + {"uarr", 8593, NULL}, + {"rarr", 8594, NULL}, + {"darr", 8595, NULL}, + {"harr", 8596, NULL}, + {"crarr", 8629, NULL}, + {"lArr", 8656, NULL}, + {"uArr", 8657, NULL}, + {"rArr", 8658, NULL}, + {"dArr", 8659, NULL}, + {"hArr", 8660, NULL}, + {"forall", 8704, NULL}, + {"part", 8706, NULL}, + {"exist", 8707, NULL}, + {"empty", 8709, NULL}, + {"nabla", 8711, NULL}, + {"isin", 8712, NULL}, + {"notin", 8713, NULL}, + {"ni", 8715, NULL}, + {"prod", 8719, NULL}, + {"sum", 8721, "E"}, + {"minus", 8722, "-"}, + {"lowast", 8727, NULL}, + {"radic", 8730, NULL}, + {"prop", 8733, NULL}, + {"infin", 8734, NULL}, + {"ang", 8736, "'"}, + {"and", 8743, "&"}, + {"or", 8744, "|"}, + {"cap", 8745, NULL}, + {"cup", 8746, NULL}, + {"gint", 8747, NULL}, + {"there4", 8756, NULL}, + {"sim", 8764, NULL}, + {"cong", 8773, NULL}, + {"asymp", 8776, NULL}, + {"ne", 8800, "!="}, + {"equiv", 8801, "=="}, + {"le", 8804, "<="}, + {"ge", 8805, ">="}, + {"sub", 8834, NULL}, + {"sup", 8835, NULL}, + {"nsub", 8836, NULL}, + {"sube", 8838, NULL}, + {"supe", 8839, NULL}, + {"oplus", 8853, NULL}, + {"otimes", 8855, NULL}, + {"perp", 8869, NULL}, + {"sdot", 8901, NULL}, + {"lceil", 8968, NULL}, + {"rceil", 8969, NULL}, + {"lfloor", 8970, NULL}, + {"rfloor", 8971, NULL}, + {"lang", 9001, NULL}, + {"rang", 9002, NULL}, + {"loz", 9674, NULL}, + {"spades", 9824, NULL}, + {"clubs", 9827, NULL}, + {"hearts", 9829, NULL}, + {"diams", 9830, NULL}, + + /* + ** Extended Entities defined in HTML 4: Special (less Markup at top) + */ + {"OElig", 338, NULL}, + {"oelig", 339, NULL}, + {"Scaron", 352, NULL}, + {"scaron", 353, NULL}, + {"Yuml", 376, NULL}, + {"circ", 710, NULL}, + {"tilde", 732, NULL}, + {"ensp", 8194, NULL}, + {"emsp", 8195, NULL}, + {"thinsp", 8201, NULL}, + {"zwnj", 8204, NULL}, + {"zwj", 8205, NULL}, + {"lrm", 8206, NULL}, + {"rlm", 8207, NULL}, + {"ndash", 8211, "-"}, + {"mdash", 8212, "-"}, + {"lsquo", 8216, "'"}, + {"rsquo", 8217, "'"}, + {"sbquo", 8218, "\""}, + {"ldquo", 8220, "\""}, + {"rdquo", 8221, "\""}, + {"bdquo", 8222, "\""}, + {"dagger", 8224, "T"}, + {"Dagger", 8225, "T"}, + {"permil", 8240, NULL}, + {"lsaquo", 8249, "\""}, + {"rsaquo", 8250, "\""}, + {"euro", 8364, "E"}, +}; + +static entity entities_defs_num[ (G_N_ELEMENTS (entities_defs)) ]; + +static gint +tag_cmp (const void *m1, const void *m2) +{ + const struct html_tag *p1 = m1; + const struct html_tag *p2 = m2; + + return g_ascii_strcasecmp (p1->name, p2->name); +} + +static gint +entity_cmp (const void *m1, const void *m2) +{ + const entity *p1 = m1; + const entity *p2 = m2; + + return g_ascii_strcasecmp (p1->name, p2->name); +} + +static gint +entity_cmp_num (const void *m1, const void *m2) +{ + const entity *p1 = m1; + const entity *p2 = m2; + + return p1->code - p2->code; +} + +static GNode * +construct_html_node (rspamd_mempool_t * pool, gchar *text, gsize tag_len) +{ + struct html_node *html; + GNode *n = NULL; + struct html_tag key, *found; + gchar t; + + if (text == NULL || *text == '\0') { + return NULL; + } + + html = rspamd_mempool_alloc0 (pool, sizeof (struct html_node)); + + /* Check whether this tag is fully closed */ + if (*(text + tag_len - 1) == '/') { + html->flags |= FL_CLOSED; + } + + /* Check xml tag */ + if (*text == '?' && g_ascii_strncasecmp (text + 1, "xml", sizeof ("xml") - 1) == 0) { + html->flags |= FL_XML; + html->tag = NULL; + } + else { + if (*text == '/') { + html->flags |= FL_CLOSING; + text++; + } + + /* Find end of tag name */ + key.name = text; + while (*text && g_ascii_isalnum (*(++text))); + + t = *text; + *text = '\0'; + + /* Match tag id by tag name */ + if ((found = bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp)) != NULL) { + *text = t; + html->tag = found; + } + else { + *text = t; + return NULL; + } + } + + n = g_node_new (html); + + return n; +} + +static gboolean +check_balance (GNode * node, GNode ** cur_level) +{ + struct html_node *arg = node->data, *tmp; + GNode *cur; + + if (arg->flags & FL_CLOSING) { + /* First of all check whether this tag is closing tag for parent node */ + cur = node->parent; + while (cur && cur->data) { + tmp = cur->data; + if ((tmp->tag && arg->tag) && tmp->tag->id == arg->tag->id && (tmp->flags & FL_CLOSED) == 0) { + tmp->flags |= FL_CLOSED; + /* Destroy current node as we find corresponding parent node */ + g_node_destroy (node); + /* Change level */ + *cur_level = cur->parent; + return TRUE; + } + cur = cur->parent; + } + } + else { + return TRUE; + } + + return FALSE; +} + +struct html_tag * +get_tag_by_name (const gchar *name) +{ + struct html_tag key; + + key.name = name; + + return bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp); +} + +/* Decode HTML entitles in text */ +void +decode_entitles (gchar *s, guint * len) +{ + guint l, rep_len; + gchar *t = s; /* t - tortoise */ + gchar *h = s; /* h - hare */ + gchar *e = s; + gchar *end_ptr; + gint state = 0, val, base; + entity *found, key; + + if (len == NULL || *len == 0) { + l = strlen (s); + } + else { + l = *len; + } + + while (h - s < (gint)l) { + switch (state) { + /* Out of entitle */ + case 0: + if (*h == '&') { + state = 1; + e = h; + h++; + continue; + } + else { + *t = *h; + h++; + t++; + } + break; + case 1: + if (*h == ';') { + /* Determine base */ + /* First find in entities table */ + + key.name = e + 1; + *h = '\0'; + if (*(e + 1) != '#' && (found = bsearch (&key, entities_defs, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp)) != NULL) { + if (found->replacement) { + rep_len = strlen (found->replacement); + memcpy (t, found->replacement, rep_len); + t += rep_len; + } + } + else { + if (*(e + 2) == 'x' || *(e + 2) == 'X') { + base = 16; + } + else if (*(e + 2) == 'o' || *(e + 2) == 'O') { + base = 8; + } + else { + base = 10; + } + if (base == 10) { + val = strtoul ((e + 2), &end_ptr, base); + } + else { + val = strtoul ((e + 3), &end_ptr, base); + } + if (end_ptr != NULL && *end_ptr != '\0') { + /* Skip undecoded */ + t = h; + } + else { + /* Search for a replacement */ + key.code = val; + found = bsearch (&key, entities_defs_num, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp_num); + if (found) { + if (found->replacement) { + rep_len = strlen (found->replacement); + memcpy (t, found->replacement, rep_len); + t += rep_len; + } + } + } + } + *h = ';'; + state = 0; + } + h++; + break; + } + } + *t = '\0'; + + if (len != NULL) { + *len = t - s; + } +} + +static void +check_phishing (struct rspamd_task *task, struct uri *href_url, const gchar *url_text, gsize remain, tag_id_t id) +{ + struct uri *new; + gchar *url_str; + const gchar *p, *c; + gchar tagbuf[128]; + struct html_tag *tag; + gsize len = 0; + gint rc; + + p = url_text; + while (len < remain) { + if (*p == '<') { + /* Check tag name */ + if (*(p + 1) == '/') { + c = p + 2; + } + else { + c = p + 1; + } + while (len < remain) { + if (!g_ascii_isspace (*p) && *p != '>') { + p ++; + len ++; + } + else { + break; + } + } + rspamd_strlcpy (tagbuf, c, MIN ((gint)sizeof(tagbuf), p - c + 1)); + if ((tag = get_tag_by_name (tagbuf)) != NULL) { + if (tag->id == id) { + break; + } + else if (tag->id == Tag_IMG) { + /* We should ignore IMG tag here */ + while (len < remain && *p != '>' && *p != '<') { + p ++; + len ++; + } + if (*p == '>' && len < remain) { + p ++; + } + + remain -= p - url_text; + url_text = p; + len = 0; + continue; + } + } + } + len ++; + p ++; + } + + if (url_try_text (task->task_pool, url_text, len, NULL, NULL, &url_str, TRUE) && url_str != NULL) { + new = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct uri)); + if (new != NULL) { + g_strstrip (url_str); + rc = parse_uri (new, url_str, task->task_pool); + + if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) { + if (g_ascii_strncasecmp (href_url->host, new->host, + MAX (href_url->hostlen, new->hostlen)) != 0) { + /* Special check for urls beginning with 'www' */ + if (new->hostlen > 4 && href_url->hostlen > 4) { + p = new->host; + c = NULL; + if ((p[0] == 'w' || p[0] == 'W') && + (p[1] == 'w' || p[1] == 'W') && + (p[2] == 'w' || p[2] == 'W') && + (p[3] == '.')) { + p += 4; + c = href_url->host; + len = MAX (href_url->hostlen, new->hostlen - 4); + } + else { + p = href_url->host; + if ((p[0] == 'w' || p[0] == 'W') && + (p[1] == 'w' || p[1] == 'W') && + (p[2] == 'w' || p[2] == 'W') && + (p[3] == '.')) { + p += 4; + c = new->host; + len = MAX (href_url->hostlen - 4, new->hostlen); + } + } + /* Compare parts and check for phished hostname */ + if (c != NULL) { + if (g_ascii_strncasecmp (p, c, len) != 0) { + href_url->is_phished = TRUE; + href_url->phished_url = new; + } + } + else { + href_url->is_phished = TRUE; + href_url->phished_url = new; + } + } + else { + href_url->is_phished = TRUE; + href_url->phished_url = new; + } + } + } + else { + msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc)); + } + } + } + +} + +static void +parse_tag_url (struct rspamd_task *task, struct mime_text_part *part, tag_id_t id, + gchar *tag_text, gsize tag_len, gsize remain) +{ + gchar *c = NULL, *p, *url_text; + gint len, rc; + struct uri *url; + gboolean got_single_quote = FALSE, got_double_quote = FALSE; + + /* For A tags search for href= and for IMG tags search for src= */ + if (id == Tag_A) { + c = rspamd_strncasestr (tag_text, "href=", tag_len); + len = sizeof ("href=") - 1; + } + else if (id == Tag_IMG) { + c = rspamd_strncasestr (tag_text, "src=", tag_len); + len = sizeof ("src=") - 1; + } + + if (c != NULL) { + /* First calculate length */ + c += len; + /* Skip spaces after eqsign */ + while (g_ascii_isspace (*c)) { + c++; + } + len = 0; + p = c; + while (*p && (guint)(p - tag_text) < tag_len) { + if (got_double_quote) { + if (*p == '"') { + break; + } + else { + len++; + } + } + else if (got_single_quote) { + if (*p == '\'') { + break; + } + else { + len++; + } + } + else if (g_ascii_isspace (*p) || *p == '>' || (*p == '/' && *(p + 1) == '>') || *p == '\r' || *p == '\n') { + break; + } + else { + if (*p == '"' && !got_single_quote) { + got_double_quote = !got_double_quote; + } + else if (*p == '\'' && !got_double_quote) { + got_single_quote = !got_single_quote; + } + else { + len++; + } + } + p++; + } + + if (got_single_quote || got_double_quote) { + c++; + } + + if (len == 0) { + return; + } + + url_text = rspamd_mempool_alloc (task->task_pool, len + 1); + rspamd_strlcpy (url_text, c, len + 1); + decode_entitles (url_text, NULL); + + if (g_ascii_strncasecmp (url_text, "http://", sizeof ("http://") - 1) != 0 && + g_ascii_strncasecmp (url_text, "www", sizeof ("www") - 1) != 0 && + g_ascii_strncasecmp (url_text, "ftp://", sizeof ("ftp://") - 1) != 0 && + g_ascii_strncasecmp (url_text, "mailto:", sizeof ("mailto:") - 1) != 0) { + return; + } + + url = rspamd_mempool_alloc (task->task_pool, sizeof (struct uri)); + rc = parse_uri (url, url_text, task->task_pool); + + if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST && url->hostlen != 0) { + /* + * Check for phishing + */ + if ((p = strchr (c, '>')) != NULL && id == Tag_A) { + p ++; + check_phishing (task, url, p, remain - (p - tag_text), id); + } + if (g_tree_lookup (task->urls, url) == NULL) { + g_tree_insert (task->urls, url, url); + } + } + } +} + +gboolean +add_html_node (struct rspamd_task *task, rspamd_mempool_t * pool, struct mime_text_part *part, + gchar *tag_text, gsize tag_len, gsize remain, GNode ** cur_level) +{ + GNode *new; + struct html_node *data; + + if (!tags_sorted) { + qsort (tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp); + tags_sorted = 1; + } + if (!entities_sorted) { + qsort (entities_defs, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp); + memcpy (entities_defs_num, entities_defs, sizeof (entities_defs)); + qsort (entities_defs_num, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp_num); + entities_sorted = 1; + } + + /* First call of this function */ + if (part->html_nodes == NULL) { + /* Insert root node */ + new = g_node_new (NULL); + *cur_level = new; + part->html_nodes = new; + rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t) g_node_destroy, part->html_nodes); + /* Call once again with root node */ + return add_html_node (task, pool, part, tag_text, tag_len, remain, cur_level); + } + else { + new = construct_html_node (pool, tag_text, tag_len); + if (new == NULL) { + debug_task ("cannot construct HTML node for text '%*s'", tag_len, tag_text); + return FALSE; + } + data = new->data; + if (data->tag && (data->tag->id == Tag_A || data->tag->id == Tag_IMG) && ((data->flags & FL_CLOSING) == 0)) { + parse_tag_url (task, part, data->tag->id, tag_text, tag_len, remain); + } + + if (data->flags & FL_CLOSING) { + if (!*cur_level) { + debug_task ("bad parent node"); + return FALSE; + } + g_node_append (*cur_level, new); + if (!check_balance (new, cur_level)) { + debug_task ("mark part as unbalanced as it has not pairable closing tags"); + part->is_balanced = FALSE; + } + } + else { + + g_node_append (*cur_level, new); + if ((data->flags & FL_CLOSED) == 0) { + *cur_level = new; + } + /* Skip some tags */ + if (data->tag && (data->tag->id == Tag_STYLE || + data->tag->id == Tag_SCRIPT || + data->tag->id == Tag_OBJECT || + data->tag->id == Tag_TITLE)) { + return FALSE; + } + } + } + + return TRUE; +} + +/* + * vi:ts=4 + */ |