summary | refs | log | tree | commit | diff | stats
path: root/src/libserver/html.c
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2014-07-23 12:57:31 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2014-07-23 12:57:31 +0100
commit: 379055dbbb4af997b4d3ffb161d447872d7ca357 (patch)
tree: 3774553d470f93e12ddeb454aad9b3b607cf8918 /src/libserver/html.c
parent: 602ae7a0b7e215ba2677131b8fdc70abc156b3ca (diff)
download: rspamd-379055dbbb4af997b4d3ffb161d447872d7ca357.tar.gz
download: rspamd-379055dbbb4af997b4d3ffb161d447872d7ca357.zip
Unify style without sorting headers.
Diffstat (limited to 'src/libserver/html.c')
-rw-r--r-- src/libserver/html.c | 245
1 files changed, 149 insertions, 96 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 028c54f6c..539ff555d 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -29,9 +29,9 @@
#include "html.h"
#include "url.h"
-static sig_atomic_t tags_sorted = 0;
+static sig_atomic_t tags_sorted = 0;
-static struct html_tag tag_defs[] = {
+static struct html_tag tag_defs[] = {
/* W3C defined elements */
{Tag_A, "a", (CM_INLINE)},
{Tag_ABBR, "abbr", (CM_INLINE)},
@@ -93,7 +93,8 @@ static struct html_tag tag_defs[] = {
{Tag_META, "meta", (CM_HEAD | CM_EMPTY)},
{Tag_NOFRAMES, "noframes", (CM_BLOCK | CM_FRAMES)},
{Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_MIXED)},
- {Tag_OBJECT, "object", (CM_OBJECT | CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)},
+ {Tag_OBJECT, "object",
+ (CM_OBJECT | CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)},
{Tag_OL, "ol", (CM_BLOCK)},
{Tag_OPTGROUP, "optgroup", (CM_FIELD | CM_OPT)},
{Tag_OPTION, "option", (CM_FIELD | CM_OPT)},
@@ -156,21 +157,21 @@ static struct html_tag tag_defs[] = {
{Tag_WBR, "wbr", (CM_INLINE | CM_EMPTY)},
};
-static sig_atomic_t entities_sorted = 0;
+static sig_atomic_t entities_sorted = 0;
struct _entity;
-typedef struct _entity entity;
+typedef struct _entity entity;
struct _entity {
- gchar *name;
- uint code;
- gchar *replacement;
+ gchar *name;
+ uint code;
+ gchar *replacement;
};
-static entity entities_defs[] = {
+static entity entities_defs[] = {
/*
- ** Markup pre-defined character entities
- */
+ ** Markup pre-defined character entities
+ */
{"quot", 34, "\""},
{"amp", 38, "&"},
{"apos", 39, "'"},
@@ -178,8 +179,8 @@ static entity entities_defs[] = {
{"gt", 62, ">"},
/*
- ** Latin-1 character entities
- */
+ ** Latin-1 character entities
+ */
{"nbsp", 160, " "},
{"iexcl", 161, "!"},
{"cent", 162, "cent"},
@@ -278,8 +279,8 @@ static entity entities_defs[] = {
{"yuml", 255, "y"},
/*
- ** Extended Entities defined in HTML 4: Symbols
- */
+ ** Extended Entities defined in HTML 4: Symbols
+ */
{"fnof", 402, "f"},
{"Alpha", 913, "alpha"},
{"Beta", 914, "beta"},
@@ -406,8 +407,8 @@ static entity entities_defs[] = {
{"diams", 9830, NULL},
/*
- ** Extended Entities defined in HTML 4: Special (less Markup at top)
- */
+ ** Extended Entities defined in HTML 4: Special (less Markup at top)
+ */
{"OElig", 338, NULL},
{"oelig", 339, NULL},
{"Scaron", 352, NULL},
@@ -443,8 +444,8 @@ static entity entities_defs_num[ (G_N_ELEMENTS (entities_defs)) ];
static gint
tag_cmp (const void *m1, const void *m2)
{
- const struct html_tag *p1 = m1;
- const struct html_tag *p2 = m2;
+ const struct html_tag *p1 = m1;
+ const struct html_tag *p2 = m2;
return g_ascii_strcasecmp (p1->name, p2->name);
}
@@ -452,8 +453,8 @@ tag_cmp (const void *m1, const void *m2)
static gint
entity_cmp (const void *m1, const void *m2)
{
- const entity *p1 = m1;
- const entity *p2 = m2;
+ const entity *p1 = m1;
+ const entity *p2 = m2;
return g_ascii_strcasecmp (p1->name, p2->name);
}
@@ -461,19 +462,19 @@ entity_cmp (const void *m1, const void *m2)
static gint
entity_cmp_num (const void *m1, const void *m2)
{
- const entity *p1 = m1;
- const entity *p2 = m2;
+ const entity *p1 = m1;
+ const entity *p2 = m2;
return p1->code - p2->code;
}
-static GNode *
+static GNode *
construct_html_node (rspamd_mempool_t * pool, gchar *text, gsize tag_len)
{
- struct html_node *html;
- GNode *n = NULL;
- struct html_tag key, *found;
- gchar t;
+ struct html_node *html;
+ GNode *n = NULL;
+ struct html_tag key, *found;
+ gchar t;
if (text == NULL || *text == '\0') {
return NULL;
@@ -487,7 +488,8 @@ construct_html_node (rspamd_mempool_t * pool, gchar *text, gsize tag_len)
}
/* Check xml tag */
- if (*text == '?' && g_ascii_strncasecmp (text + 1, "xml", sizeof ("xml") - 1) == 0) {
+ if (*text == '?' &&
+ g_ascii_strncasecmp (text + 1, "xml", sizeof ("xml") - 1) == 0) {
html->flags |= FL_XML;
html->tag = NULL;
}
@@ -499,13 +501,15 @@ construct_html_node (rspamd_mempool_t * pool, gchar *text, gsize tag_len)
/* Find end of tag name */
key.name = text;
- while (*text && g_ascii_isalnum (*(++text)));
+ while (*text && g_ascii_isalnum (*(++text))) ;
t = *text;
*text = '\0';
/* Match tag id by tag name */
- if ((found = bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp)) != NULL) {
+ if ((found =
+ bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs),
+ sizeof (struct html_tag), tag_cmp)) != NULL) {
*text = t;
html->tag = found;
}
@@ -520,18 +524,20 @@ construct_html_node (rspamd_mempool_t * pool, gchar *text, gsize tag_len)
return n;
}
-static gboolean
+static gboolean
check_balance (GNode * node, GNode ** cur_level)
{
- struct html_node *arg = node->data, *tmp;
- GNode *cur;
+ struct html_node *arg = node->data, *tmp;
+ GNode *cur;
if (arg->flags & FL_CLOSING) {
/* First of all check whether this tag is closing tag for parent node */
cur = node->parent;
while (cur && cur->data) {
tmp = cur->data;
- if ((tmp->tag && arg->tag) && tmp->tag->id == arg->tag->id && (tmp->flags & FL_CLOSED) == 0) {
+ if ((tmp->tag &&
+ arg->tag) && tmp->tag->id == arg->tag->id &&
+ (tmp->flags & FL_CLOSED) == 0) {
tmp->flags |= FL_CLOSED;
/* Destroy current node as we find corresponding parent node */
g_node_destroy (node);
@@ -549,27 +555,28 @@ check_balance (GNode * node, GNode ** cur_level)
return FALSE;
}
-struct html_tag *
+struct html_tag *
get_tag_by_name (const gchar *name)
{
- struct html_tag key;
+ struct html_tag key;
key.name = name;
- return bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
+ return bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs),
+ sizeof (struct html_tag), tag_cmp);
}
/* Decode HTML entitles in text */
void
decode_entitles (gchar *s, guint * len)
{
- guint l, rep_len;
- gchar *t = s; /* t - tortoise */
- gchar *h = s; /* h - hare */
- gchar *e = s;
- gchar *end_ptr;
- gint state = 0, val, base;
- entity *found, key;
+ guint l, rep_len;
+ gchar *t = s; /* t - tortoise */
+ gchar *h = s; /* h - hare */
+ gchar *e = s;
+ gchar *end_ptr;
+ gint state = 0, val, base;
+ entity *found, key;
if (len == NULL || *len == 0) {
l = strlen (s);
@@ -580,7 +587,7 @@ decode_entitles (gchar *s, guint * len)
while (h - s < (gint)l) {
switch (state) {
- /* Out of entitle */
+ /* Out of entitle */
case 0:
if (*h == '&') {
state = 1;
@@ -601,7 +608,10 @@ decode_entitles (gchar *s, guint * len)
key.name = e + 1;
*h = '\0';
- if (*(e + 1) != '#' && (found = bsearch (&key, entities_defs, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp)) != NULL) {
+ if (*(e + 1) != '#' &&
+ (found =
+ bsearch (&key, entities_defs, G_N_ELEMENTS (entities_defs),
+ sizeof (entity), entity_cmp)) != NULL) {
if (found->replacement) {
rep_len = strlen (found->replacement);
memcpy (t, found->replacement, rep_len);
@@ -631,7 +641,10 @@ decode_entitles (gchar *s, guint * len)
else {
/* Search for a replacement */
key.code = val;
- found = bsearch (&key, entities_defs_num, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp_num);
+ found =
+ bsearch (&key, entities_defs_num, G_N_ELEMENTS (
+ entities_defs), sizeof (entity),
+ entity_cmp_num);
if (found) {
if (found->replacement) {
rep_len = strlen (found->replacement);
@@ -656,15 +669,19 @@ decode_entitles (gchar *s, guint * len)
}
static void
-check_phishing (struct rspamd_task *task, struct uri *href_url, const gchar *url_text, gsize remain, tag_id_t id)
+check_phishing (struct rspamd_task *task,
+ struct uri *href_url,
+ const gchar *url_text,
+ gsize remain,
+ tag_id_t id)
{
- struct uri *new;
- gchar *url_str;
- const gchar *p, *c;
- gchar tagbuf[128];
- struct html_tag *tag;
- gsize len = 0;
- gint rc;
+ struct uri *new;
+ gchar *url_str;
+ const gchar *p, *c;
+ gchar tagbuf[128];
+ struct html_tag *tag;
+ gsize len = 0;
+ gint rc;
p = url_text;
while (len < remain) {
@@ -678,8 +695,8 @@ check_phishing (struct rspamd_task *task, struct uri *href_url, const gchar *url
}
while (len < remain) {
if (!g_ascii_isspace (*p) && *p != '>') {
- p ++;
- len ++;
+ p++;
+ len++;
}
else {
break;
@@ -693,11 +710,11 @@ check_phishing (struct rspamd_task *task, struct uri *href_url, const gchar *url
else if (tag->id == Tag_IMG) {
/* We should ignore IMG tag here */
while (len < remain && *p != '>' && *p != '<') {
- p ++;
- len ++;
+ p++;
+ len++;
}
if (*p == '>' && len < remain) {
- p ++;
+ p++;
}
remain -= p - url_text;
@@ -707,19 +724,21 @@ check_phishing (struct rspamd_task *task, struct uri *href_url, const gchar *url
}
}
}
- len ++;
- p ++;
+ len++;
+ p++;
}
- if (url_try_text (task->task_pool, url_text, len, NULL, NULL, &url_str, TRUE) && url_str != NULL) {
+ if (url_try_text (task->task_pool, url_text, len, NULL, NULL, &url_str,
+ TRUE) && url_str != NULL) {
new = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct uri));
if (new != NULL) {
g_strstrip (url_str);
rc = parse_uri (new, url_str, task->task_pool);
- if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) {
+ if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc ==
+ URI_ERRNO_NO_HOST_SLASH) {
if (g_ascii_strncasecmp (href_url->host, new->host,
- MAX (href_url->hostlen, new->hostlen)) != 0) {
+ MAX (href_url->hostlen, new->hostlen)) != 0) {
/* Special check for urls beginning with 'www' */
if (new->hostlen > 4 && href_url->hostlen > 4) {
p = new->host;
@@ -762,7 +781,9 @@ check_phishing (struct rspamd_task *task, struct uri *href_url, const gchar *url
}
}
else {
- msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc));
+ msg_info ("extract of url '%s' failed: %s",
+ url_str,
+ url_strerror (rc));
}
}
}
@@ -770,13 +791,17 @@ check_phishing (struct rspamd_task *task, struct uri *href_url, const gchar *url
}
static void
-parse_tag_url (struct rspamd_task *task, struct mime_text_part *part, tag_id_t id,
- gchar *tag_text, gsize tag_len, gsize remain)
+parse_tag_url (struct rspamd_task *task,
+ struct mime_text_part *part,
+ tag_id_t id,
+ gchar *tag_text,
+ gsize tag_len,
+ gsize remain)
{
- gchar *c = NULL, *p, *url_text;
- gint len, rc;
- struct uri *url;
- gboolean got_single_quote = FALSE, got_double_quote = FALSE;
+ gchar *c = NULL, *p, *url_text;
+ gint len, rc;
+ struct uri *url;
+ gboolean got_single_quote = FALSE, got_double_quote = FALSE;
/* For A tags search for href= and for IMG tags search for src= */
if (id == Tag_A) {
@@ -814,7 +839,8 @@ parse_tag_url (struct rspamd_task *task, struct mime_text_part *part, tag_id_t i
len++;
}
}
- else if (g_ascii_isspace (*p) || *p == '>' || (*p == '/' && *(p + 1) == '>') || *p == '\r' || *p == '\n') {
+ else if (g_ascii_isspace (*p) || *p == '>' ||
+ (*p == '/' && *(p + 1) == '>') || *p == '\r' || *p == '\n') {
break;
}
else {
@@ -843,22 +869,27 @@ parse_tag_url (struct rspamd_task *task, struct mime_text_part *part, tag_id_t i
rspamd_strlcpy (url_text, c, len + 1);
decode_entitles (url_text, NULL);
- if (g_ascii_strncasecmp (url_text, "http://", sizeof ("http://") - 1) != 0 &&
- g_ascii_strncasecmp (url_text, "www", sizeof ("www") - 1) != 0 &&
- g_ascii_strncasecmp (url_text, "ftp://", sizeof ("ftp://") - 1) != 0 &&
- g_ascii_strncasecmp (url_text, "mailto:", sizeof ("mailto:") - 1) != 0) {
+ if (g_ascii_strncasecmp (url_text, "http://",
+ sizeof ("http://") - 1) != 0 &&
+ g_ascii_strncasecmp (url_text, "www",
+ sizeof ("www") - 1) != 0 &&
+ g_ascii_strncasecmp (url_text, "ftp://",
+ sizeof ("ftp://") - 1) != 0 &&
+ g_ascii_strncasecmp (url_text, "mailto:",
+ sizeof ("mailto:") - 1) != 0) {
return;
}
url = rspamd_mempool_alloc (task->task_pool, sizeof (struct uri));
rc = parse_uri (url, url_text, task->task_pool);
- if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST && url->hostlen != 0) {
+ if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST && url->hostlen !=
+ 0) {
/*
* Check for phishing
*/
if ((p = strchr (c, '>')) != NULL && id == Tag_A) {
- p ++;
+ p++;
check_phishing (task, url, p, remain - (p - tag_text), id);
}
if (g_tree_lookup (task->urls, url) == NULL) {
@@ -869,20 +900,28 @@ parse_tag_url (struct rspamd_task *task, struct mime_text_part *part, tag_id_t i
}
gboolean
-add_html_node (struct rspamd_task *task, rspamd_mempool_t * pool, struct mime_text_part *part,
- gchar *tag_text, gsize tag_len, gsize remain, GNode ** cur_level)
+add_html_node (struct rspamd_task *task,
+ rspamd_mempool_t * pool,
+ struct mime_text_part *part,
+ gchar *tag_text,
+ gsize tag_len,
+ gsize remain,
+ GNode ** cur_level)
{
- GNode *new;
- struct html_node *data;
+ GNode *new;
+ struct html_node *data;
if (!tags_sorted) {
- qsort (tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
+ qsort (tag_defs, G_N_ELEMENTS (
+ tag_defs), sizeof (struct html_tag), tag_cmp);
tags_sorted = 1;
}
if (!entities_sorted) {
- qsort (entities_defs, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp);
+ qsort (entities_defs, G_N_ELEMENTS (
+ entities_defs), sizeof (entity), entity_cmp);
memcpy (entities_defs_num, entities_defs, sizeof (entities_defs));
- qsort (entities_defs_num, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp_num);
+ qsort (entities_defs_num, G_N_ELEMENTS (
+ entities_defs), sizeof (entity), entity_cmp_num);
entities_sorted = 1;
}
@@ -892,19 +931,32 @@ add_html_node (struct rspamd_task *task, rspamd_mempool_t * pool, struct mime_te
new = g_node_new (NULL);
*cur_level = new;
part->html_nodes = new;
- rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t) g_node_destroy, part->html_nodes);
+ rspamd_mempool_add_destructor (pool,
+ (rspamd_mempool_destruct_t) g_node_destroy,
+ part->html_nodes);
/* Call once again with root node */
- return add_html_node (task, pool, part, tag_text, tag_len, remain, cur_level);
+ return add_html_node (task,
+ pool,
+ part,
+ tag_text,
+ tag_len,
+ remain,
+ cur_level);
}
else {
new = construct_html_node (pool, tag_text, tag_len);
if (new == NULL) {
- debug_task ("cannot construct HTML node for text '%*s'", tag_len, tag_text);
+ debug_task ("cannot construct HTML node for text '%*s'",
+ tag_len,
+ tag_text);
return FALSE;
}
data = new->data;
- if (data->tag && (data->tag->id == Tag_A || data->tag->id == Tag_IMG) && ((data->flags & FL_CLOSING) == 0)) {
- parse_tag_url (task, part, data->tag->id, tag_text, tag_len, remain);
+ if (data->tag &&
+ (data->tag->id == Tag_A ||
+ data->tag->id == Tag_IMG) && ((data->flags & FL_CLOSING) == 0)) {
+ parse_tag_url (task, part, data->tag->id, tag_text, tag_len,
+ remain);
}
if (data->flags & FL_CLOSING) {
@@ -914,7 +966,8 @@ add_html_node (struct rspamd_task *task, rspamd_mempool_t * pool, struct mime_te
}
g_node_append (*cur_level, new);
if (!check_balance (new, cur_level)) {
- debug_task ("mark part as unbalanced as it has not pairable closing tags");
+ debug_task (
+ "mark part as unbalanced as it has not pairable closing tags");
part->is_balanced = FALSE;
}
}
@@ -926,9 +979,9 @@ add_html_node (struct rspamd_task *task, rspamd_mempool_t * pool, struct mime_te
}
/* Skip some tags */
if (data->tag && (data->tag->id == Tag_STYLE ||
- data->tag->id == Tag_SCRIPT ||
- data->tag->id == Tag_OBJECT ||
- data->tag->id == Tag_TITLE)) {
+ data->tag->id == Tag_SCRIPT ||
+ data->tag->id == Tag_OBJECT ||
+ data->tag->id == Tag_TITLE)) {
return FALSE;
}
}