diff options
Diffstat (limited to 'src/libmime/message.c')
-rw-r--r-- | src/libmime/message.c | 269 |
1 files changed, 5 insertions, 264 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c index 70885a36d..f48151d05 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -53,261 +53,6 @@ rspamd_message_quark (void) return g_quark_from_static_string ("mime-error"); } -GByteArray * -strip_html_tags (struct rspamd_task *task, - rspamd_mempool_t * pool, - struct mime_text_part *part, - GByteArray * src, - gint *stateptr) -{ - uint8_t *p, *rp, *tbegin = NULL, *end, c, lc, *estart = NULL; - gint br, i = 0, depth = 0, in_q = 0; - gint state = 0; - guint dlen; - GByteArray *buf; - GNode *level_ptr = NULL; - gboolean erase = FALSE, html_decode = FALSE; - - if (stateptr) - state = *stateptr; - - buf = g_byte_array_sized_new (src->len); - g_byte_array_append (buf, src->data, src->len); - - c = *src->data; - lc = '\0'; - p = src->data; - rp = buf->data; - end = src->data + src->len; - br = 0; - - while (i < (gint)src->len) { - switch (c) { - case '\0': - break; - case '<': - if (g_ascii_isspace (*(p + 1))) { - goto reg_char; - } - if (state == 0) { - lc = '<'; - tbegin = p + 1; - state = 1; - } - else if (state == 1) { - /* Opening bracket without closing one */ - p--; - while (g_ascii_isspace (*p) && p > src->data) { - p--; - } - p++; - goto unbreak_tag; - } - break; - - case '(': - if (state == 2) { - if (lc != '"' && lc != '\'') { - lc = '('; - br++; - } - } - else if (state == 0 && !erase) { - *(rp++) = c; - } - break; - - case ')': - if (state == 2) { - if (lc != '"' && lc != '\'') { - lc = ')'; - br--; - } - } - else if (state == 0 && !erase) { - *(rp++) = c; - } - break; - - case '>': - if (depth) { - depth--; - break; - } - - if (in_q) { - break; - } -unbreak_tag: - switch (state) { - case 1: /* HTML/XML */ - lc = '>'; - in_q = state = 0; - erase = !add_html_node (task, - pool, - part, - tbegin, - p - tbegin, - end - tbegin, - &level_ptr); - break; - - case 2: /* PHP */ - if (!br && lc != '\"' && *(p - 1) == '?') { - in_q = state = 0; - } - break; - - case 3: - in_q = state = 0; - break; - - case 4: /* JavaScript/CSS/etc... */ - if (p >= src->data + 2 && *(p - 1) == '-' && *(p - 2) == '-') { - in_q = state = 0; - } - break; - - default: - if (!erase) { - *(rp++) = c; - } - break; - } - break; - - case '"': - case '\'': - if (state == 2 && *(p - 1) != '\\') { - if (lc == c) { - lc = '\0'; - } - else if (lc != '\\') { - lc = c; - } - } - else if (state == 0 && !erase) { - *(rp++) = c; - } - if (state && p != src->data && *(p - 1) != '\\' && - (!in_q || *p == in_q)) { - if (in_q) { - in_q = 0; - } - else { - in_q = *p; - } - } - break; - - case '!': - /* JavaScript & Other HTML scripting languages */ - if (state == 1 && *(p - 1) == '<') { - state = 3; - lc = c; - } - else { - if (state == 0 && !erase) { - *(rp++) = c; - } - } - break; - - case '-': - if (state == 3 && p >= src->data + 2 && *(p - 1) == '-' && - *(p - 2) == '!') { - state = 4; - } - else { - goto reg_char; - } - break; - - case '&': - /* Decode entitle */ - html_decode = TRUE; - estart = rp; - goto reg_char; - break; - - case ';': - if (html_decode) { - html_decode = FALSE; - *rp = ';'; - if (rp - estart > 0) { - dlen = rp - estart + 1; - rspamd_html_decode_entitles_inplace (estart, &dlen); - rp = estart + dlen; - } - } - break; - - case '?': - - if (state == 1 && *(p - 1) == '<') { - br = 0; - state = 2; - break; - } - case 'E': - case 'e': - /* !DOCTYPE exception */ - if (state == 3 && p > src->data + 6 - && g_ascii_tolower (*(p - 1)) == 'p' - && g_ascii_tolower (*(p - 2)) == 'y' - && g_ascii_tolower (*(p - 3)) == 't' && - g_ascii_tolower (*(p - 4)) == 'c' && - g_ascii_tolower (*(p - 5)) == 'o' && - g_ascii_tolower (*(p - 6)) == 'd') { - state = 1; - break; - } - /* fall-through */ - case 'l': - - /* swm: If we encounter '<?xml' then we shouldn't be in - * state == 2 (PHP). Switch back to HTML. - */ - - if (state == 2 && p > src->data + 2 && *(p - 1) == 'm' && - *(p - 2) == 'x') { - state = 1; - break; - } - - /* fall-through */ - default: -reg_char: - if (state == 0 && !erase) { - *(rp++) = c; - } - break; - } - i++; - if (i < (gint)src->len) { - c = *(++p); - } - } - if (rp < buf->data + src->len) { - *rp = '\0'; - g_byte_array_set_size (buf, rp - buf->data); - } - - /* Check tag balancing */ - if (level_ptr && level_ptr->data != NULL) { - part->flags &= ~RSPAMD_MIME_PART_FLAG_BALANCED; - } - else { - part->flags |= RSPAMD_MIME_PART_FLAG_BALANCED; - } - - if (stateptr) { - *stateptr = state; - } - - return buf; -} - static void parse_qmail_recv (rspamd_mempool_t * pool, gchar *line, @@ -1386,21 +1131,17 @@ process_text_part (struct rspamd_task *task, text_part->orig, type, text_part); - text_part->html_nodes = NULL; + text_part->html = rspamd_mempool_alloc (task->task_pool, + sizeof (*text_part->html)); text_part->parent = parent; text_part->mime_part = mime_part; text_part->flags |= RSPAMD_MIME_PART_FLAG_BALANCED; - text_part->content = strip_html_tags (task, + text_part->content = rspamd_html_process_part ( task->task_pool, - text_part, - part_content, - NULL); + text_part->html, + part_content); - if (text_part->html_nodes != NULL) { - rspamd_html_decode_entitles_inplace (text_part->content->data, - &text_part->content->len); - } rspamd_url_text_extract (task->task_pool, task, text_part, TRUE); rspamd_mempool_add_destructor (task->task_pool, |