summary | refs | log | tree | commit | diff | stats
path: root/src/libserver/html.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libserver/html.c')
-rw-r--r-- src/libserver/html.c 942
1 files changed, 942 insertions, 0 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c
new file mode 100644
index 000000000..028c54f6c
--- /dev/null
+++ b/src/libserver/html.c
@@ -0,0 +1,942 @@
+/*
+ * Copyright (c) 2009-2012, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "util.h"
+#include "main.h"
+#include "message.h"
+#include "html.h"
+#include "url.h"
+/*
+ * Set to 1 by add_html_node() once tag_defs has been sorted with tag_cmp();
+ * the bsearch() lookups over tag_defs in this file rely on that ordering.
+ */
+static sig_atomic_t tags_sorted = 0;
+/*
+ * Table of known HTML tags. Each record holds the tag id, the tag name that
+ * tag_cmp() compares (ignoring ASCII case) and a mask of CM_* flags.
+ * The table is not declared in sorted order (see "nextid" and the proprietary
+ * section); add_html_node() sorts it in place before the first lookup.
+ * NOTE(review): the CM_* flags look like content-model categories; their
+ * definitions are not visible here, confirm against html.h.
+ */
+static struct html_tag tag_defs[] = {
+ /* W3C defined elements */
+ {Tag_A, "a", (CM_INLINE)},
+ {Tag_ABBR, "abbr", (CM_INLINE)},
+ {Tag_ACRONYM, "acronym", (CM_INLINE)},
+ {Tag_ADDRESS, "address", (CM_BLOCK)},
+ {Tag_APPLET, "applet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)},
+ {Tag_AREA, "area", (CM_BLOCK | CM_EMPTY)},
+ {Tag_B, "b", (CM_INLINE)},
+ {Tag_BASE, "base", (CM_HEAD | CM_EMPTY)},
+ {Tag_BASEFONT, "basefont", (CM_INLINE | CM_EMPTY)},
+ {Tag_BDO, "bdo", (CM_INLINE)},
+ {Tag_BIG, "big", (CM_INLINE)},
+ {Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)},
+ {Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST)},
+ {Tag_BR, "br", (CM_INLINE | CM_EMPTY)},
+ {Tag_BUTTON, "button", (CM_INLINE)},
+ {Tag_CAPTION, "caption", (CM_TABLE)},
+ {Tag_CENTER, "center", (CM_BLOCK)},
+ {Tag_CITE, "cite", (CM_INLINE)},
+ {Tag_CODE, "code", (CM_INLINE)},
+ {Tag_COL, "col", (CM_TABLE | CM_EMPTY)},
+ {Tag_COLGROUP, "colgroup", (CM_TABLE | CM_OPT)},
+ {Tag_DD, "dd", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)},
+ {Tag_DEL, "del", (CM_INLINE | CM_BLOCK | CM_MIXED)},
+ {Tag_DFN, "dfn", (CM_INLINE)},
+ {Tag_DIR, "dir", (CM_BLOCK | CM_OBSOLETE)},
+ {Tag_DIV, "div", (CM_BLOCK)},
+ {Tag_DL, "dl", (CM_BLOCK)},
+ {Tag_DT, "dt", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)},
+ {Tag_EM, "em", (CM_INLINE)},
+ {Tag_FIELDSET, "fieldset", (CM_BLOCK)},
+ {Tag_FONT, "font", (CM_INLINE)},
+ {Tag_FORM, "form", (CM_BLOCK)},
+ {Tag_FRAME, "frame", (CM_FRAMES | CM_EMPTY)},
+ {Tag_FRAMESET, "frameset", (CM_HTML | CM_FRAMES)},
+ {Tag_H1, "h1", (CM_BLOCK | CM_HEADING)},
+ {Tag_H2, "h2", (CM_BLOCK | CM_HEADING)},
+ {Tag_H3, "h3", (CM_BLOCK | CM_HEADING)},
+ {Tag_H4, "h4", (CM_BLOCK | CM_HEADING)},
+ {Tag_H5, "h5", (CM_BLOCK | CM_HEADING)},
+ {Tag_H6, "h6", (CM_BLOCK | CM_HEADING)},
+ {Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST)},
+ {Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)},
+ {Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST)},
+ {Tag_I, "i", (CM_INLINE)},
+ {Tag_IFRAME, "iframe", (CM_INLINE)},
+ {Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)},
+ {Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)},
+ {Tag_INS, "ins", (CM_INLINE | CM_BLOCK | CM_MIXED)},
+ {Tag_ISINDEX, "isindex", (CM_BLOCK | CM_EMPTY)},
+ {Tag_KBD, "kbd", (CM_INLINE)},
+ {Tag_LABEL, "label", (CM_INLINE)},
+ {Tag_LEGEND, "legend", (CM_INLINE)},
+ {Tag_LI, "li", (CM_LIST | CM_OPT | CM_NO_INDENT)},
+ {Tag_LINK, "link", (CM_HEAD | CM_EMPTY)},
+ {Tag_LISTING, "listing", (CM_BLOCK | CM_OBSOLETE)},
+ {Tag_MAP, "map", (CM_INLINE)},
+ {Tag_MENU, "menu", (CM_BLOCK | CM_OBSOLETE)},
+ {Tag_META, "meta", (CM_HEAD | CM_EMPTY)},
+ {Tag_NOFRAMES, "noframes", (CM_BLOCK | CM_FRAMES)},
+ {Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_MIXED)},
+ {Tag_OBJECT, "object", (CM_OBJECT | CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)},
+ {Tag_OL, "ol", (CM_BLOCK)},
+ {Tag_OPTGROUP, "optgroup", (CM_FIELD | CM_OPT)},
+ {Tag_OPTION, "option", (CM_FIELD | CM_OPT)},
+ {Tag_P, "p", (CM_BLOCK | CM_OPT)},
+ {Tag_PARAM, "param", (CM_INLINE | CM_EMPTY)},
+ {Tag_PLAINTEXT, "plaintext", (CM_BLOCK | CM_OBSOLETE)},
+ {Tag_PRE, "pre", (CM_BLOCK)},
+ {Tag_Q, "q", (CM_INLINE)},
+ {Tag_RB, "rb", (CM_INLINE)},
+ {Tag_RBC, "rbc", (CM_INLINE)},
+ {Tag_RP, "rp", (CM_INLINE)},
+ {Tag_RT, "rt", (CM_INLINE)},
+ {Tag_RTC, "rtc", (CM_INLINE)},
+ {Tag_RUBY, "ruby", (CM_INLINE)},
+ {Tag_S, "s", (CM_INLINE)},
+ {Tag_SAMP, "samp", (CM_INLINE)},
+ {Tag_SCRIPT, "script", (CM_HEAD | CM_MIXED | CM_BLOCK | CM_INLINE)},
+ {Tag_SELECT, "select", (CM_INLINE | CM_FIELD)},
+ {Tag_SMALL, "small", (CM_INLINE)},
+ {Tag_SPAN, "span", (CM_INLINE)},
+ {Tag_STRIKE, "strike", (CM_INLINE)},
+ {Tag_STRONG, "strong", (CM_INLINE)},
+ {Tag_STYLE, "style", (CM_HEAD)},
+ {Tag_SUB, "sub", (CM_INLINE)},
+ {Tag_SUP, "sup", (CM_INLINE)},
+ {Tag_TABLE, "table", (CM_BLOCK)},
+ {Tag_TBODY, "tbody", (CM_TABLE | CM_ROWGRP | CM_OPT)},
+ {Tag_TD, "td", (CM_ROW | CM_OPT | CM_NO_INDENT)},
+ {Tag_TEXTAREA, "textarea", (CM_INLINE | CM_FIELD)},
+ {Tag_TFOOT, "tfoot", (CM_TABLE | CM_ROWGRP | CM_OPT)},
+ {Tag_TH, "th", (CM_ROW | CM_OPT | CM_NO_INDENT)},
+ {Tag_THEAD, "thead", (CM_TABLE | CM_ROWGRP | CM_OPT)},
+ {Tag_TITLE, "title", (CM_HEAD)},
+ {Tag_TR, "tr", (CM_TABLE | CM_OPT)},
+ {Tag_TT, "tt", (CM_INLINE)},
+ {Tag_U, "u", (CM_INLINE)},
+ {Tag_UL, "ul", (CM_BLOCK)},
+ {Tag_VAR, "var", (CM_INLINE)},
+ {Tag_XMP, "xmp", (CM_BLOCK | CM_OBSOLETE)},
+ {Tag_NEXTID, "nextid", (CM_HEAD | CM_EMPTY)},
+
+ /* proprietary elements */
+ {Tag_ALIGN, "align", (CM_BLOCK)},
+ {Tag_BGSOUND, "bgsound", (CM_HEAD | CM_EMPTY)},
+ {Tag_BLINK, "blink", (CM_INLINE)},
+ {Tag_COMMENT, "comment", (CM_INLINE)},
+ {Tag_EMBED, "embed", (CM_INLINE | CM_IMG | CM_EMPTY)},
+ {Tag_ILAYER, "ilayer", (CM_INLINE)},
+ {Tag_KEYGEN, "keygen", (CM_INLINE | CM_EMPTY)},
+ {Tag_LAYER, "layer", (CM_BLOCK)},
+ {Tag_MARQUEE, "marquee", (CM_INLINE | CM_OPT)},
+ {Tag_MULTICOL, "multicol", (CM_BLOCK)},
+ {Tag_NOBR, "nobr", (CM_INLINE)},
+ {Tag_NOEMBED, "noembed", (CM_INLINE)},
+ {Tag_NOLAYER, "nolayer", (CM_BLOCK | CM_INLINE | CM_MIXED)},
+ {Tag_NOSAVE, "nosave", (CM_BLOCK)},
+ {Tag_SERVER, "server", (CM_HEAD | CM_MIXED | CM_BLOCK | CM_INLINE)},
+ {Tag_SERVLET, "servlet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)},
+ {Tag_SPACER, "spacer", (CM_INLINE | CM_EMPTY)},
+ {Tag_WBR, "wbr", (CM_INLINE | CM_EMPTY)},
+};
+/* Set to 1 by add_html_node() after the entity tables have been sorted */
+static sig_atomic_t entities_sorted = 0;
+struct _entity;
+typedef struct _entity entity;
+/*
+ * One character entity known to decode_entitles().
+ */
+struct _entity {
+ gchar *name; /* entity name, matched against the text between '&' and ';' */
+ uint code; /* numeric code, matched against the number in "&#N;" references */
+ gchar *replacement; /* text written in place of the reference; NULL writes nothing */
+};
+
+
+/*
+ * Character entities understood by decode_entitles(): entity name, numeric
+ * code and the plain-text replacement that is written over the reference
+ * (NULL means that the reference is removed without a substitute).
+ *
+ * The table is sorted in place by name in add_html_node(), so the order of
+ * the records below is not significant.
+ *
+ * Fixed here: the integral sign entity is called "int" (it was spelled
+ * "gint"), and the lowercase accented "i" entities map to "i" like their
+ * uppercase counterparts (they mapped to "e").
+ */
+static entity entities_defs[] = {
+	/*
+	** Markup pre-defined character entities
+	*/
+	{"quot", 34, "\""},
+	{"amp", 38, "&"},
+	{"apos", 39, "'"},
+	{"lt", 60, "<"},
+	{"gt", 62, ">"},
+
+	/*
+	** Latin-1 character entities
+	*/
+	{"nbsp", 160, " "},
+	{"iexcl", 161, "!"},
+	{"cent", 162, "cent"},
+	{"pound", 163, "pound"},
+	{"curren", 164, "current"},
+	{"yen", 165, "yen"},
+	{"brvbar", 166, NULL},
+	{"sect", 167, NULL},
+	{"uml", 168, "uml"},
+	{"copy", 169, "c"},
+	{"ordf", 170, NULL},
+	{"laquo", 171, "\""},
+	{"not", 172, "!"},
+	{"shy", 173, NULL},
+	{"reg", 174, "r"},
+	{"macr", 175, NULL},
+	{"deg", 176, "deg"},
+	{"plusmn", 177, "+-"},
+	{"sup2", 178, "2"},
+	{"sup3", 179, "3"},
+	{"acute", 180, NULL},
+	{"micro", 181, NULL},
+	{"para", 182, NULL},
+	{"middot", 183, "."},
+	{"cedil", 184, NULL},
+	{"sup1", 185, "1"},
+	{"ordm", 186, NULL},
+	{"raquo", 187, "\""},
+	{"frac14", 188, "1/4"},
+	{"frac12", 189, "1/2"},
+	{"frac34", 190, "3/4"},
+	{"iquest", 191, "i"},
+	{"Agrave", 192, "a"},
+	{"Aacute", 193, "a"},
+	{"Acirc", 194, "a"},
+	{"Atilde", 195, "a"},
+	{"Auml", 196, "a"},
+	{"Aring", 197, "a"},
+	{"AElig", 198, "a"},
+	{"Ccedil", 199, "c"},
+	{"Egrave", 200, "e"},
+	{"Eacute", 201, "e"},
+	{"Ecirc", 202, "e"},
+	{"Euml", 203, "e"},
+	{"Igrave", 204, "i"},
+	{"Iacute", 205, "i"},
+	{"Icirc", 206, "i"},
+	{"Iuml", 207, "i"},
+	{"ETH", 208, "e"},
+	{"Ntilde", 209, "n"},
+	{"Ograve", 210, "o"},
+	{"Oacute", 211, "o"},
+	{"Ocirc", 212, "o"},
+	{"Otilde", 213, "o"},
+	{"Ouml", 214, "o"},
+	{"times", 215, "t"},
+	{"Oslash", 216, "o"},
+	{"Ugrave", 217, "u"},
+	{"Uacute", 218, "u"},
+	{"Ucirc", 219, "u"},
+	{"Uuml", 220, "u"},
+	{"Yacute", 221, "y"},
+	{"THORN", 222, "t"},
+	{"szlig", 223, "s"},
+	{"agrave", 224, "a"},
+	{"aacute", 225, "a"},
+	{"acirc", 226, "a"},
+	{"atilde", 227, "a"},
+	{"auml", 228, "a"},
+	{"aring", 229, "a"},
+	{"aelig", 230, "a"},
+	{"ccedil", 231, "c"},
+	{"egrave", 232, "e"},
+	{"eacute", 233, "e"},
+	{"ecirc", 234, "e"},
+	{"euml", 235, "e"},
+	{"igrave", 236, "i"},
+	{"iacute", 237, "i"},
+	{"icirc", 238, "i"},
+	{"iuml", 239, "i"},
+	{"eth", 240, "e"},
+	{"ntilde", 241, "n"},
+	{"ograve", 242, "o"},
+	{"oacute", 243, "o"},
+	{"ocirc", 244, "o"},
+	{"otilde", 245, "o"},
+	{"ouml", 246, "o"},
+	{"divide", 247, "/"},
+	{"oslash", 248, "/"},
+	{"ugrave", 249, "u"},
+	{"uacute", 250, "u"},
+	{"ucirc", 251, "u"},
+	{"uuml", 252, "u"},
+	{"yacute", 253, "y"},
+	{"thorn", 254, "t"},
+	{"yuml", 255, "y"},
+
+	/*
+	** Extended Entities defined in HTML 4: Symbols
+	*/
+	{"fnof", 402, "f"},
+	{"Alpha", 913, "alpha"},
+	{"Beta", 914, "beta"},
+	{"Gamma", 915, "gamma"},
+	{"Delta", 916, "delta"},
+	{"Epsilon", 917, "epsilon"},
+	{"Zeta", 918, "zeta"},
+	{"Eta", 919, "eta"},
+	{"Theta", 920, "theta"},
+	{"Iota", 921, "iota"},
+	{"Kappa", 922, "kappa"},
+	{"Lambda", 923, "lambda"},
+	{"Mu", 924, "mu"},
+	{"Nu", 925, "nu"},
+	{"Xi", 926, "xi"},
+	{"Omicron", 927, "omicron"},
+	{"Pi", 928, "pi"},
+	{"Rho", 929, "rho"},
+	{"Sigma", 931, "sigma"},
+	{"Tau", 932, "tau"},
+	{"Upsilon", 933, "upsilon"},
+	{"Phi", 934, "phi"},
+	{"Chi", 935, "chi"},
+	{"Psi", 936, "psi"},
+	{"Omega", 937, "omega"},
+	{"alpha", 945, "alpha"},
+	{"beta", 946, "beta"},
+	{"gamma", 947, "gamma"},
+	{"delta", 948, "delta"},
+	{"epsilon", 949, "epsilon"},
+	{"zeta", 950, "zeta"},
+	{"eta", 951, "eta"},
+	{"theta", 952, "theta"},
+	{"iota", 953, "iota"},
+	{"kappa", 954, "kappa"},
+	{"lambda", 955, "lambda"},
+	{"mu", 956, "mu"},
+	{"nu", 957, "nu"},
+	{"xi", 958, "xi"},
+	{"omicron", 959, "omicron"},
+	{"pi", 960, "pi"},
+	{"rho", 961, "rho"},
+	{"sigmaf", 962, "sigmaf"},
+	{"sigma", 963, "sigma"},
+	{"tau", 964, "tau"},
+	{"upsilon", 965, "upsilon"},
+	{"phi", 966, "phi"},
+	{"chi", 967, "chi"},
+	{"psi", 968, "psi"},
+	{"omega", 969, "omega"},
+	{"thetasym", 977, "thetasym"},
+	{"upsih", 978, "upsih"},
+	{"piv", 982, "piv"},
+	{"bull", 8226, "bull"},
+	{"hellip", 8230, "..."},
+	{"prime", 8242, "'"},
+	{"Prime", 8243, "'"},
+	{"oline", 8254, "-"},
+	{"frasl", 8260, NULL},
+	{"weierp", 8472, NULL},
+	{"image", 8465, NULL},
+	{"real", 8476, NULL},
+	{"trade", 8482, NULL},
+	{"alefsym", 8501, "a"},
+	{"larr", 8592, NULL},
+	{"uarr", 8593, NULL},
+	{"rarr", 8594, NULL},
+	{"darr", 8595, NULL},
+	{"harr", 8596, NULL},
+	{"crarr", 8629, NULL},
+	{"lArr", 8656, NULL},
+	{"uArr", 8657, NULL},
+	{"rArr", 8658, NULL},
+	{"dArr", 8659, NULL},
+	{"hArr", 8660, NULL},
+	{"forall", 8704, NULL},
+	{"part", 8706, NULL},
+	{"exist", 8707, NULL},
+	{"empty", 8709, NULL},
+	{"nabla", 8711, NULL},
+	{"isin", 8712, NULL},
+	{"notin", 8713, NULL},
+	{"ni", 8715, NULL},
+	{"prod", 8719, NULL},
+	{"sum", 8721, "E"},
+	{"minus", 8722, "-"},
+	{"lowast", 8727, NULL},
+	{"radic", 8730, NULL},
+	{"prop", 8733, NULL},
+	{"infin", 8734, NULL},
+	{"ang", 8736, "'"},
+	{"and", 8743, "&"},
+	{"or", 8744, "|"},
+	{"cap", 8745, NULL},
+	{"cup", 8746, NULL},
+	{"int", 8747, NULL},
+	{"there4", 8756, NULL},
+	{"sim", 8764, NULL},
+	{"cong", 8773, NULL},
+	{"asymp", 8776, NULL},
+	{"ne", 8800, "!="},
+	{"equiv", 8801, "=="},
+	{"le", 8804, "<="},
+	{"ge", 8805, ">="},
+	{"sub", 8834, NULL},
+	{"sup", 8835, NULL},
+	{"nsub", 8836, NULL},
+	{"sube", 8838, NULL},
+	{"supe", 8839, NULL},
+	{"oplus", 8853, NULL},
+	{"otimes", 8855, NULL},
+	{"perp", 8869, NULL},
+	{"sdot", 8901, NULL},
+	{"lceil", 8968, NULL},
+	{"rceil", 8969, NULL},
+	{"lfloor", 8970, NULL},
+	{"rfloor", 8971, NULL},
+	{"lang", 9001, NULL},
+	{"rang", 9002, NULL},
+	{"loz", 9674, NULL},
+	{"spades", 9824, NULL},
+	{"clubs", 9827, NULL},
+	{"hearts", 9829, NULL},
+	{"diams", 9830, NULL},
+
+	/*
+	** Extended Entities defined in HTML 4: Special (less Markup at top)
+	*/
+	{"OElig", 338, NULL},
+	{"oelig", 339, NULL},
+	{"Scaron", 352, NULL},
+	{"scaron", 353, NULL},
+	{"Yuml", 376, NULL},
+	{"circ", 710, NULL},
+	{"tilde", 732, NULL},
+	{"ensp", 8194, NULL},
+	{"emsp", 8195, NULL},
+	{"thinsp", 8201, NULL},
+	{"zwnj", 8204, NULL},
+	{"zwj", 8205, NULL},
+	{"lrm", 8206, NULL},
+	{"rlm", 8207, NULL},
+	{"ndash", 8211, "-"},
+	{"mdash", 8212, "-"},
+	{"lsquo", 8216, "'"},
+	{"rsquo", 8217, "'"},
+	{"sbquo", 8218, "\""},
+	{"ldquo", 8220, "\""},
+	{"rdquo", 8221, "\""},
+	{"bdquo", 8222, "\""},
+	{"dagger", 8224, "T"},
+	{"Dagger", 8225, "T"},
+	{"permil", 8240, NULL},
+	{"lsaquo", 8249, "\""},
+	{"rsaquo", 8250, "\""},
+	{"euro", 8364, "E"},
+};
+
+/*
+ * Copy of entities_defs ordered by numeric code; it is filled and sorted in
+ * add_html_node() and searched for "&#N;" references.
+ */
+static entity entities_defs_num[ (G_N_ELEMENTS (entities_defs)) ];
+
+/*
+ * Ordering callback for qsort()/bsearch() over struct html_tag records:
+ * the records are ordered by their name, ignoring ASCII case.
+ */
+static gint
+tag_cmp (const void *m1, const void *m2)
+{
+	const struct html_tag *lhs = (const struct html_tag *) m1;
+	const struct html_tag *rhs = (const struct html_tag *) m2;
+	gint order;
+
+	order = g_ascii_strcasecmp (lhs->name, rhs->name);
+
+	return order;
+}
+
+/*
+ * Ordering callback for qsort()/bsearch() over entity records:
+ * the records are ordered by their name, ignoring ASCII case.
+ */
+static gint
+entity_cmp (const void *m1, const void *m2)
+{
+	const entity *lhs = (const entity *) m1;
+	const entity *rhs = (const entity *) m2;
+	gint order;
+
+	order = g_ascii_strcasecmp (lhs->name, rhs->name);
+
+	return order;
+}
+
+/*
+ * Ordering callback for qsort()/bsearch() over entity records keyed by their
+ * numeric code.
+ *
+ * The codes are unsigned, so they are compared explicitly: returning the
+ * difference converted to gint reports the wrong sign once the two codes are
+ * more than G_MAXINT apart.
+ */
+static gint
+entity_cmp_num (const void *m1, const void *m2)
+{
+	const entity *p1 = m1;
+	const entity *p2 = m2;
+
+	if (p1->code < p2->code) {
+		return -1;
+	}
+
+	return p1->code > p2->code ? 1 : 0;
+}
+
+/*
+ * Build a tree node for the tag whose text (without the leading '<') starts
+ * at `text` and is `tag_len` bytes long.
+ *
+ * The html_node payload is allocated from `pool`; the GNode itself comes from
+ * g_node_new(). Flags set here: FL_CLOSED for a trailing '/', FL_XML for a
+ * "?xml" declaration (no tag definition is attached in that case) and
+ * FL_CLOSING for a leading '/'.
+ *
+ * The tag name is terminated in place for the table lookup and the
+ * overwritten byte is restored before returning, hence `text` is not const.
+ *
+ * Returns NULL when the text is empty or the tag name is not in tag_defs.
+ */
+static GNode *
+construct_html_node (rspamd_mempool_t * pool, gchar *text, gsize tag_len)
+{
+	struct html_node *html;
+	struct html_tag key, *found;
+	gchar saved;
+
+	if (text == NULL || *text == '\0') {
+		return NULL;
+	}
+
+	html = rspamd_mempool_alloc0 (pool, sizeof (struct html_node));
+
+	/*
+	 * Check whether this tag is fully closed; a zero length must not make
+	 * us look at the byte in front of the tag text.
+	 */
+	if (tag_len > 0 && text[tag_len - 1] == '/') {
+		html->flags |= FL_CLOSED;
+	}
+
+	/* Check xml tag */
+	if (*text == '?' && g_ascii_strncasecmp (text + 1, "xml", sizeof ("xml") - 1) == 0) {
+		html->flags |= FL_XML;
+		html->tag = NULL;
+
+		return g_node_new (html);
+	}
+
+	if (*text == '/') {
+		html->flags |= FL_CLOSING;
+		text++;
+	}
+
+	/* Find end of tag name (the first character is accepted as is) */
+	key.name = text;
+	while (*text && g_ascii_isalnum (*(++text)));
+
+	/* Match tag id by tag name, terminating the name only for the lookup */
+	saved = *text;
+	*text = '\0';
+	found = bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
+	*text = saved;
+
+	if (found == NULL) {
+		return NULL;
+	}
+	html->tag = found;
+
+	return g_node_new (html);
+}
+
+/*
+ * Pair a closing tag node with the nearest ancestor that has the same tag id
+ * and is not closed yet.
+ *
+ * On a match the ancestor is flagged FL_CLOSED, `node` is destroyed and
+ * *cur_level is moved to the ancestor's parent. Nodes that are not closing
+ * tags are always accepted.
+ *
+ * Returns FALSE only for a closing tag without a matching open ancestor.
+ */
+static gboolean
+check_balance (GNode * node, GNode ** cur_level)
+{
+	struct html_node *closing = node->data, *candidate;
+	GNode *ancestor;
+
+	if ((closing->flags & FL_CLOSING) == 0) {
+		return TRUE;
+	}
+
+	/* Walk up until the root node, which carries no data */
+	for (ancestor = node->parent; ancestor != NULL && ancestor->data != NULL;
+			ancestor = ancestor->parent) {
+		candidate = ancestor->data;
+
+		if (candidate->tag == NULL || closing->tag == NULL) {
+			continue;
+		}
+		if (candidate->tag->id != closing->tag->id) {
+			continue;
+		}
+		if ((candidate->flags & FL_CLOSED) != 0) {
+			continue;
+		}
+
+		candidate->flags |= FL_CLOSED;
+		/* The closing node itself is not kept in the tree */
+		g_node_destroy (node);
+		/* Change level */
+		*cur_level = ancestor->parent;
+
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
+/*
+ * Look up a tag definition by name (ASCII case is ignored).
+ *
+ * tag_defs is searched with bsearch(), so it has to be sorted first. The sort
+ * is otherwise only performed by add_html_node(); it is repeated here under
+ * the same tags_sorted flag so that a lookup made before any node has been
+ * added does not search an unsorted table.
+ *
+ * Returns the matching record of tag_defs, or NULL if `name` is NULL or
+ * unknown.
+ */
+struct html_tag *
+get_tag_by_name (const gchar *name)
+{
+	struct html_tag key;
+
+	if (name == NULL) {
+		return NULL;
+	}
+
+	if (!tags_sorted) {
+		qsort (tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
+		tags_sorted = 1;
+	}
+
+	key.name = name;
+
+	return bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
+}
+
+/*
+ * Decode HTML entities in text, in place.
+ *
+ * s   - buffer to decode; it is rewritten and NUL-terminated, so the byte at
+ *       s[length] must be writable.
+ * len - optional in/out length: when NULL or pointing to 0 the input length
+ *       is taken with strlen(); when not NULL it receives the decoded length.
+ *
+ * "&name;" is looked up in entities_defs and "&#N;", "&#xN;" and "&#oN;" in
+ * entities_defs_num; a match is replaced by the table's replacement text, or
+ * removed when the replacement is NULL. Numeric references that are valid but
+ * not in the table are removed as well, except for printable ASCII codes,
+ * which decode to the character itself.
+ *
+ * Anything that cannot be decoded is copied through unchanged: unknown names,
+ * malformed numbers, a '&' that is never closed by ';', and a replacement
+ * that would not fit into the space already consumed (writing it would
+ * overwrite unread input or run past the end of the buffer).
+ */
+void
+decode_entitles (gchar *s, guint * len)
+{
+	guint l, rep_len;
+	gchar *t = s;	/* t - tortoise, the write position */
+	gchar *h = s;	/* h - hare, the read position */
+	gchar *e = s;	/* the '&' that opened the reference being scanned */
+	gchar *end, *num, *end_ptr;
+	const gchar *rep;
+	gchar ascii[2];
+	gint state = 0, base;
+	gulong val;
+	gboolean known;
+	entity *found, key;
+
+	if (s == NULL) {
+		return;
+	}
+
+	/* Same lazy initialisation as in add_html_node(): bsearch needs sorted tables */
+	if (!entities_sorted) {
+		qsort (entities_defs, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp);
+		memcpy (entities_defs_num, entities_defs, sizeof (entities_defs));
+		qsort (entities_defs_num, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp_num);
+		entities_sorted = 1;
+	}
+
+	if (len == NULL || *len == 0) {
+		l = strlen (s);
+	}
+	else {
+		l = *len;
+	}
+	end = s + l;
+
+	/* Invariant: t <= h, so in-place writes never overtake the reader */
+	while (h < end) {
+		if (state == 0) {
+			/* Out of entity */
+			if (*h == '&') {
+				state = 1;
+				e = h;
+			}
+			else {
+				*t++ = *h;
+			}
+			h++;
+			continue;
+		}
+
+		/* Inside of entity */
+		if (*h == '&') {
+			/* The pending '&' was not a reference: keep it and restart here */
+			memmove (t, e, h - e);
+			t += h - e;
+			e = h;
+			h++;
+			continue;
+		}
+		if (*h != ';') {
+			h++;
+			continue;
+		}
+
+		known = FALSE;
+		rep = NULL;
+		/* Terminate the reference body for the lookups below */
+		*h = '\0';
+
+		if (*(e + 1) == '#') {
+			/* Numeric reference: decimal, or hex/octal after x/o */
+			num = e + 2;
+			base = 10;
+			if (*num == 'x' || *num == 'X') {
+				base = 16;
+				num++;
+			}
+			else if (*num == 'o' || *num == 'O') {
+				base = 8;
+				num++;
+			}
+			val = strtoul (num, &end_ptr, base);
+			/* Require at least one digit and nothing but digits */
+			if (end_ptr != num && *end_ptr == '\0' && val <= 0x10FFFF) {
+				known = TRUE;
+				key.code = val;
+				found = bsearch (&key, entities_defs_num, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp_num);
+				if (found != NULL) {
+					rep = found->replacement;
+				}
+				else if (val >= 0x20 && val < 0x7F) {
+					ascii[0] = (gchar) val;
+					ascii[1] = '\0';
+					rep = ascii;
+				}
+			}
+		}
+		else if (e + 1 < h) {
+			/* Named reference */
+			key.name = e + 1;
+			found = bsearch (&key, entities_defs, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp);
+			if (found != NULL) {
+				known = TRUE;
+				rep = found->replacement;
+			}
+		}
+
+		/* Restore the input before anything is written over it */
+		*h = ';';
+
+		rep_len = rep != NULL ? strlen (rep) : 0;
+		if (known && rep_len <= (guint)(h - t + 1)) {
+			if (rep_len > 0) {
+				memcpy (t, rep, rep_len);
+				t += rep_len;
+			}
+		}
+		else {
+			/* Keep the undecoded reference, ';' included */
+			memmove (t, e, h - e + 1);
+			t += h - e + 1;
+		}
+
+		state = 0;
+		h++;
+	}
+
+	if (state == 1) {
+		/* Input ended inside "&...": it was plain text after all */
+		memmove (t, e, end - e);
+		t += end - e;
+	}
+	*t = '\0';
+
+	if (len != NULL) {
+		*len = t - s;
+	}
+}
+
+/*
+ * Compare the host of an <a href> URL with a URL found in the text that
+ * follows the opening tag, and mark href_url as phished when the hosts differ.
+ *
+ * task     - task whose pool is used for allocations
+ * href_url - parsed URL of the href attribute; is_phished and phished_url
+ *            are set on a mismatch
+ * url_text - text following the opening tag
+ * remain   - number of bytes available at url_text
+ * id       - id of the tag that ends the scan (its opening or closing form)
+ *
+ * The text is scanned up to a tag with this id; <img> tags met on the way
+ * are skipped together with their attributes. Hosts that differ only by a
+ * leading "www." on one side are treated as equal.
+ *
+ * Compared to the previous revision the scan never reads a byte at or past
+ * url_text + remain, and the length handed to url_try_text() cannot exceed
+ * `remain`.
+ */
+static void
+check_phishing (struct rspamd_task *task, struct uri *href_url, const gchar *url_text, gsize remain, tag_id_t id)
+{
+	struct uri *new;
+	gchar *url_str;
+	const gchar *p, *c;
+	gchar tagbuf[128];
+	struct html_tag *tag;
+	gsize len = 0;
+	gint rc;
+	gboolean phished;
+
+	p = url_text;
+	while (len < remain) {
+		if (*p == '<') {
+			/* Check tag name, skipping the '/' of a closing tag */
+			if (len + 1 < remain && *(p + 1) == '/') {
+				c = p + 2;
+			}
+			else {
+				c = p + 1;
+			}
+			while (len < remain) {
+				if (!g_ascii_isspace (*p) && *p != '>') {
+					p ++;
+					len ++;
+				}
+				else {
+					break;
+				}
+			}
+			/* p >= c holds here, so the copy size is at least 1 */
+			rspamd_strlcpy (tagbuf, c, MIN ((gint)sizeof(tagbuf), p - c + 1));
+			if ((tag = get_tag_by_name (tagbuf)) != NULL) {
+				if (tag->id == id) {
+					break;
+				}
+				else if (tag->id == Tag_IMG) {
+					/* We should ignore IMG tag here */
+					while (len < remain && *p != '>' && *p != '<') {
+						p ++;
+						len ++;
+					}
+					if (len < remain && *p == '>') {
+						p ++;
+					}
+
+					/* Restart the scan right after the skipped tag */
+					remain -= p - url_text;
+					url_text = p;
+					len = 0;
+					continue;
+				}
+			}
+			if (len >= remain) {
+				/* The tag name ran up to the end of the text */
+				break;
+			}
+		}
+		len ++;
+		p ++;
+	}
+
+	if (!url_try_text (task->task_pool, url_text, len, NULL, NULL, &url_str, TRUE) || url_str == NULL) {
+		return;
+	}
+
+	new = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct uri));
+	if (new == NULL) {
+		return;
+	}
+
+	g_strstrip (url_str);
+	rc = parse_uri (new, url_str, task->task_pool);
+
+	if (rc != URI_ERRNO_OK && rc != URI_ERRNO_NO_SLASHES && rc != URI_ERRNO_NO_HOST_SLASH) {
+		msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc));
+		return;
+	}
+
+	if (g_ascii_strncasecmp (href_url->host, new->host,
+			MAX (href_url->hostlen, new->hostlen)) == 0) {
+		/* Same host, nothing to report */
+		return;
+	}
+
+	phished = TRUE;
+
+	/* Special check for urls beginning with 'www' */
+	if (new->hostlen > 4 && href_url->hostlen > 4) {
+		c = NULL;
+		if (g_ascii_strncasecmp (new->host, "www.", 4) == 0) {
+			p = new->host + 4;
+			c = href_url->host;
+			len = MAX (href_url->hostlen, new->hostlen - 4);
+		}
+		else if (g_ascii_strncasecmp (href_url->host, "www.", 4) == 0) {
+			p = href_url->host + 4;
+			c = new->host;
+			len = MAX (href_url->hostlen - 4, new->hostlen);
+		}
+		/* Compare parts and check for phished hostname */
+		if (c != NULL && g_ascii_strncasecmp (p, c, len) == 0) {
+			phished = FALSE;
+		}
+	}
+
+	if (phished) {
+		href_url->is_phished = TRUE;
+		href_url->phished_url = new;
+	}
+}
+
+/*
+ * Extract the URL from the href= attribute of an <a> tag or the src=
+ * attribute of an <img> tag and register it in task->urls.
+ *
+ * task     - task that owns the memory pool and the urls tree
+ * part     - text part being parsed (not used here)
+ * id       - Tag_A or Tag_IMG; other ids are ignored
+ * tag_text - start of the tag text, tag_len bytes long
+ * remain   - number of bytes available starting at tag_text
+ *
+ * The attribute value may be bare or enclosed in single or double quotes.
+ * HTML entities in it are decoded, and only values starting with "http://",
+ * "www", "ftp://" or "mailto:" are parsed. For <a> tags the text following
+ * the tag is handed to check_phishing().
+ *
+ * The uri structure is allocated zeroed, so the fields read after parse_uri()
+ * (hostlen here, is_phished elsewhere) are defined even when the parser
+ * leaves them untouched.
+ */
+static void
+parse_tag_url (struct rspamd_task *task, struct mime_text_part *part, tag_id_t id,
+	gchar *tag_text, gsize tag_len, gsize remain)
+{
+	gchar *c = NULL, *p, *url_text;
+	gint len = 0, rc;
+	struct uri *url;
+	gboolean got_single_quote = FALSE, got_double_quote = FALSE;
+
+	/* For A tags search for href= and for IMG tags search for src= */
+	if (id == Tag_A) {
+		c = rspamd_strncasestr (tag_text, "href=", tag_len);
+		len = sizeof ("href=") - 1;
+	}
+	else if (id == Tag_IMG) {
+		c = rspamd_strncasestr (tag_text, "src=", tag_len);
+		len = sizeof ("src=") - 1;
+	}
+
+	if (c == NULL) {
+		return;
+	}
+
+	/* First calculate length */
+	c += len;
+	/* Skip spaces after eqsign, staying inside of the tag */
+	while ((gsize)(c - tag_text) < tag_len && g_ascii_isspace (*c)) {
+		c++;
+	}
+	len = 0;
+	p = c;
+	while (*p && (guint)(p - tag_text) < tag_len) {
+		if (got_double_quote) {
+			if (*p == '"') {
+				break;
+			}
+			else {
+				len++;
+			}
+		}
+		else if (got_single_quote) {
+			if (*p == '\'') {
+				break;
+			}
+			else {
+				len++;
+			}
+		}
+		else if (g_ascii_isspace (*p) || *p == '>' || (*p == '/' && *(p + 1) == '>') || *p == '\r' || *p == '\n') {
+			break;
+		}
+		else {
+			if (*p == '"' && !got_single_quote) {
+				got_double_quote = !got_double_quote;
+			}
+			else if (*p == '\'' && !got_double_quote) {
+				got_single_quote = !got_single_quote;
+			}
+			else {
+				len++;
+			}
+		}
+		p++;
+	}
+
+	if (got_single_quote || got_double_quote) {
+		/* Step over the opening quote */
+		c++;
+	}
+
+	if (len == 0) {
+		return;
+	}
+
+	url_text = rspamd_mempool_alloc (task->task_pool, len + 1);
+	rspamd_strlcpy (url_text, c, len + 1);
+	decode_entitles (url_text, NULL);
+
+	if (g_ascii_strncasecmp (url_text, "http://", sizeof ("http://") - 1) != 0 &&
+		g_ascii_strncasecmp (url_text, "www", sizeof ("www") - 1) != 0 &&
+		g_ascii_strncasecmp (url_text, "ftp://", sizeof ("ftp://") - 1) != 0 &&
+		g_ascii_strncasecmp (url_text, "mailto:", sizeof ("mailto:") - 1) != 0) {
+		return;
+	}
+
+	url = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct uri));
+	rc = parse_uri (url, url_text, task->task_pool);
+
+	if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST && url->hostlen != 0) {
+		/*
+		 * Check for phishing
+		 */
+		if (id == Tag_A && (p = strchr (c, '>')) != NULL) {
+			p ++;
+			/* Do not let the remaining length wrap around */
+			if ((gsize)(p - tag_text) <= remain) {
+				check_phishing (task, url, p, remain - (p - tag_text), id);
+			}
+		}
+		if (g_tree_lookup (task->urls, url) == NULL) {
+			g_tree_insert (task->urls, url, url);
+		}
+	}
+}
+
+/*
+ * Add the tag found at tag_text to the HTML tree of a text part.
+ *
+ * task      - task being processed (URLs of <a>/<img> tags are added to it)
+ * pool      - memory pool for node payloads; it also gets a destructor that
+ *             frees the whole tree
+ * part      - text part that owns the tree (part->html_nodes)
+ * tag_text  - tag text without the leading '<', tag_len bytes long
+ * remain    - number of bytes available starting at tag_text
+ * cur_level - in/out: the node new tags are appended to
+ *
+ * The first call sorts the tag and entity tables and creates the root node.
+ * An opening tag becomes the new current level unless it is self-closed; a
+ * closing tag is paired with an open ancestor through check_balance(), and
+ * the part is marked as unbalanced when there is none.
+ *
+ * Returns FALSE when the tag is unknown, when a closing tag has no current
+ * level to attach to, or when the tag is <style>, <script>, <object> or
+ * <title>; TRUE otherwise.
+ */
+gboolean
+add_html_node (struct rspamd_task *task, rspamd_mempool_t * pool, struct mime_text_part *part,
+	gchar *tag_text, gsize tag_len, gsize remain, GNode ** cur_level)
+{
+	GNode *new;
+	struct html_node *data;
+
+	if (!tags_sorted) {
+		qsort (tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
+		tags_sorted = 1;
+	}
+	if (!entities_sorted) {
+		qsort (entities_defs, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp);
+		memcpy (entities_defs_num, entities_defs, sizeof (entities_defs));
+		qsort (entities_defs_num, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp_num);
+		entities_sorted = 1;
+	}
+
+	/* First call of this function */
+	if (part->html_nodes == NULL) {
+		/* Insert root node, then handle the tag below as usual */
+		new = g_node_new (NULL);
+		*cur_level = new;
+		part->html_nodes = new;
+		rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t) g_node_destroy, part->html_nodes);
+	}
+
+	new = construct_html_node (pool, tag_text, tag_len);
+	if (new == NULL) {
+		debug_task ("cannot construct HTML node for text '%*s'", tag_len, tag_text);
+		return FALSE;
+	}
+	data = new->data;
+	if (data->tag && (data->tag->id == Tag_A || data->tag->id == Tag_IMG) && ((data->flags & FL_CLOSING) == 0)) {
+		parse_tag_url (task, part, data->tag->id, tag_text, tag_len, remain);
+	}
+
+	if (data->flags & FL_CLOSING) {
+		if (!*cur_level) {
+			debug_task ("bad parent node");
+			/* The node was never linked into the tree, so free it here */
+			g_node_destroy (new);
+			return FALSE;
+		}
+		g_node_append (*cur_level, new);
+		if (!check_balance (new, cur_level)) {
+			debug_task ("mark part as unbalanced as it has not pairable closing tags");
+			part->is_balanced = FALSE;
+		}
+	}
+	else {
+		g_node_append (*cur_level, new);
+		if ((data->flags & FL_CLOSED) == 0) {
+			*cur_level = new;
+		}
+		/* Skip some tags */
+		if (data->tag && (data->tag->id == Tag_STYLE ||
+			data->tag->id == Tag_SCRIPT ||
+			data->tag->id == Tag_OBJECT ||
+			data->tag->id == Tag_TITLE)) {
+			return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+/*
+ * vi:ts=4
+ */