diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-07-16 09:36:06 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-07-16 09:36:06 +0100 |
commit | c81b5b124a6ce760894b46f254d64739175b9590 (patch) | |
tree | 067dcefb2d9c088b1c24db569fb1be3592a970a2 | |
parent | 757d70003805aebf8bff4acdd9c1a8a273065649 (diff) | |
download | rspamd-c81b5b124a6ce760894b46f254d64739175b9590.tar.gz rspamd-c81b5b124a6ce760894b46f254d64739175b9590.zip |
Use new HTML API in message.c
-rw-r--r-- | src/libmime/message.c | 269 | ||||
-rw-r--r-- | src/libserver/html.c | 4 | ||||
-rw-r--r-- | src/libserver/html.h | 2 |
3 files changed, 8 insertions, 267 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c index 70885a36d..f48151d05 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -53,261 +53,6 @@ rspamd_message_quark (void) return g_quark_from_static_string ("mime-error"); } -GByteArray * -strip_html_tags (struct rspamd_task *task, - rspamd_mempool_t * pool, - struct mime_text_part *part, - GByteArray * src, - gint *stateptr) -{ - uint8_t *p, *rp, *tbegin = NULL, *end, c, lc, *estart = NULL; - gint br, i = 0, depth = 0, in_q = 0; - gint state = 0; - guint dlen; - GByteArray *buf; - GNode *level_ptr = NULL; - gboolean erase = FALSE, html_decode = FALSE; - - if (stateptr) - state = *stateptr; - - buf = g_byte_array_sized_new (src->len); - g_byte_array_append (buf, src->data, src->len); - - c = *src->data; - lc = '\0'; - p = src->data; - rp = buf->data; - end = src->data + src->len; - br = 0; - - while (i < (gint)src->len) { - switch (c) { - case '\0': - break; - case '<': - if (g_ascii_isspace (*(p + 1))) { - goto reg_char; - } - if (state == 0) { - lc = '<'; - tbegin = p + 1; - state = 1; - } - else if (state == 1) { - /* Opening bracket without closing one */ - p--; - while (g_ascii_isspace (*p) && p > src->data) { - p--; - } - p++; - goto unbreak_tag; - } - break; - - case '(': - if (state == 2) { - if (lc != '"' && lc != '\'') { - lc = '('; - br++; - } - } - else if (state == 0 && !erase) { - *(rp++) = c; - } - break; - - case ')': - if (state == 2) { - if (lc != '"' && lc != '\'') { - lc = ')'; - br--; - } - } - else if (state == 0 && !erase) { - *(rp++) = c; - } - break; - - case '>': - if (depth) { - depth--; - break; - } - - if (in_q) { - break; - } -unbreak_tag: - switch (state) { - case 1: /* HTML/XML */ - lc = '>'; - in_q = state = 0; - erase = !add_html_node (task, - pool, - part, - tbegin, - p - tbegin, - end - tbegin, - &level_ptr); - break; - - case 2: /* PHP */ - if (!br && lc != '\"' && *(p - 1) == '?') { - in_q = state = 0; - } - break; - - case 3: - in_q = state = 0; - break; - - case 4: /* JavaScript/CSS/etc... */ - if (p >= src->data + 2 && *(p - 1) == '-' && *(p - 2) == '-') { - in_q = state = 0; - } - break; - - default: - if (!erase) { - *(rp++) = c; - } - break; - } - break; - - case '"': - case '\'': - if (state == 2 && *(p - 1) != '\\') { - if (lc == c) { - lc = '\0'; - } - else if (lc != '\\') { - lc = c; - } - } - else if (state == 0 && !erase) { - *(rp++) = c; - } - if (state && p != src->data && *(p - 1) != '\\' && - (!in_q || *p == in_q)) { - if (in_q) { - in_q = 0; - } - else { - in_q = *p; - } - } - break; - - case '!': - /* JavaScript & Other HTML scripting languages */ - if (state == 1 && *(p - 1) == '<') { - state = 3; - lc = c; - } - else { - if (state == 0 && !erase) { - *(rp++) = c; - } - } - break; - - case '-': - if (state == 3 && p >= src->data + 2 && *(p - 1) == '-' && - *(p - 2) == '!') { - state = 4; - } - else { - goto reg_char; - } - break; - - case '&': - /* Decode entitle */ - html_decode = TRUE; - estart = rp; - goto reg_char; - break; - - case ';': - if (html_decode) { - html_decode = FALSE; - *rp = ';'; - if (rp - estart > 0) { - dlen = rp - estart + 1; - rspamd_html_decode_entitles_inplace (estart, &dlen); - rp = estart + dlen; - } - } - break; - - case '?': - - if (state == 1 && *(p - 1) == '<') { - br = 0; - state = 2; - break; - } - case 'E': - case 'e': - /* !DOCTYPE exception */ - if (state == 3 && p > src->data + 6 - && g_ascii_tolower (*(p - 1)) == 'p' - && g_ascii_tolower (*(p - 2)) == 'y' - && g_ascii_tolower (*(p - 3)) == 't' && - g_ascii_tolower (*(p - 4)) == 'c' && - g_ascii_tolower (*(p - 5)) == 'o' && - g_ascii_tolower (*(p - 6)) == 'd') { - state = 1; - break; - } - /* fall-through */ - case 'l': - - /* swm: If we encounter '<?xml' then we shouldn't be in - * state == 2 (PHP). Switch back to HTML. - */ - - if (state == 2 && p > src->data + 2 && *(p - 1) == 'm' && - *(p - 2) == 'x') { - state = 1; - break; - } - - /* fall-through */ - default: -reg_char: - if (state == 0 && !erase) { - *(rp++) = c; - } - break; - } - i++; - if (i < (gint)src->len) { - c = *(++p); - } - } - if (rp < buf->data + src->len) { - *rp = '\0'; - g_byte_array_set_size (buf, rp - buf->data); - } - - /* Check tag balancing */ - if (level_ptr && level_ptr->data != NULL) { - part->flags &= ~RSPAMD_MIME_PART_FLAG_BALANCED; - } - else { - part->flags |= RSPAMD_MIME_PART_FLAG_BALANCED; - } - - if (stateptr) { - *stateptr = state; - } - - return buf; -} - static void parse_qmail_recv (rspamd_mempool_t * pool, gchar *line, @@ -1386,21 +1131,17 @@ process_text_part (struct rspamd_task *task, text_part->orig, type, text_part); - text_part->html_nodes = NULL; + text_part->html = rspamd_mempool_alloc (task->task_pool, + sizeof (*text_part->html)); text_part->parent = parent; text_part->mime_part = mime_part; text_part->flags |= RSPAMD_MIME_PART_FLAG_BALANCED; - text_part->content = strip_html_tags (task, + text_part->content = rspamd_html_process_part ( task->task_pool, - text_part, - part_content, - NULL); + text_part->html, + part_content); - if (text_part->html_nodes != NULL) { - rspamd_html_decode_entitles_inplace (text_part->content->data, - &text_part->content->len); - } rspamd_url_text_extract (task->task_pool, task, text_part, TRUE); rspamd_mempool_add_destructor (task->task_pool, diff --git a/src/libserver/html.c b/src/libserver/html.c index 6af61a199..5bf042153 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -1365,7 +1365,7 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool, *statep = state; } -gboolean +GByteArray* rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc, GByteArray *in) { @@ -1635,5 +1635,5 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc, } } - return TRUE; + return dest; } diff --git a/src/libserver/html.h b/src/libserver/html.h index 0f1a95dc7..3095813c2 100644 --- a/src/libserver/html.h +++ b/src/libserver/html.h @@ -63,7 +63,7 @@ struct html_tag * get_tag_by_name (const gchar *name); */ guint rspamd_html_decode_entitles_inplace (gchar *s, guint len); -gboolean rspamd_html_process_part (rspamd_mempool_t *pool, +GByteArray* rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc, GByteArray *in); |