1 file changed, 5 insertions, 264 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 70885a36d..f48151d05 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -53,261 +53,6 @@ rspamd_message_quark (void)
 	return g_quark_from_static_string ("mime-error");
 }
 
-GByteArray *
-strip_html_tags (struct rspamd_task *task,
-	rspamd_mempool_t * pool,
-	struct mime_text_part *part,
-	GByteArray * src,
-	gint *stateptr)
-{
-	uint8_t *p, *rp, *tbegin = NULL, *end, c, lc, *estart = NULL;
-	gint br, i = 0, depth = 0, in_q = 0;
-	gint state = 0;
-	guint dlen;
-	GByteArray *buf;
-	GNode *level_ptr = NULL;
-	gboolean erase = FALSE, html_decode = FALSE;
-
-	if (stateptr)
-		state = *stateptr;
-
-	buf = g_byte_array_sized_new (src->len);
-	g_byte_array_append (buf, src->data, src->len);
-
-	c = *src->data;
-	lc = '\0';
-	p = src->data;
-	rp = buf->data;
-	end = src->data + src->len;
-	br = 0;
-
-	while (i < (gint)src->len) {
-		switch (c) {
-		case '\0':
-			break;
-		case '<':
-			if (g_ascii_isspace (*(p + 1))) {
-				goto reg_char;
-			}
-			if (state == 0) {
-				lc = '<';
-				tbegin = p + 1;
-				state = 1;
-			}
-			else if (state == 1) {
-				/* Opening bracket without closing one */
-				p--;
-				while (g_ascii_isspace (*p) && p > src->data) {
-					p--;
-				}
-				p++;
-				goto unbreak_tag;
-			}
-			break;
-
-		case '(':
-			if (state == 2) {
-				if (lc != '"' && lc != '\'') {
-					lc = '(';
-					br++;
-				}
-			}
-			else if (state == 0 && !erase) {
-				*(rp++) = c;
-			}
-			break;
-
-		case ')':
-			if (state == 2) {
-				if (lc != '"' && lc != '\'') {
-					lc = ')';
-					br--;
-				}
-			}
-			else if (state == 0 && !erase) {
-				*(rp++) = c;
-			}
-			break;
-
-		case '>':
-			if (depth) {
-				depth--;
-				break;
-			}
-
-			if (in_q) {
-				break;
-			}
-unbreak_tag:
-			switch (state) {
-			case 1:         /* HTML/XML */
-				lc = '>';
-				in_q = state = 0;
-				erase = !add_html_node (task,
-						pool,
-						part,
-						tbegin,
-						p - tbegin,
-						end - tbegin,
-						&level_ptr);
-				break;
-
-			case 2:         /* PHP */
-				if (!br && lc != '\"' && *(p - 1) == '?') {
-					in_q = state = 0;
-				}
-				break;
-
-			case 3:
-				in_q = state = 0;
-				break;
-
-			case 4:         /* JavaScript/CSS/etc... */
-				if (p >= src->data + 2 && *(p - 1) == '-' && *(p - 2) == '-') {
-					in_q = state = 0;
-				}
-				break;
-
-			default:
-				if (!erase) {
-					*(rp++) = c;
-				}
-				break;
-			}
-			break;
-
-		case '"':
-		case '\'':
-			if (state == 2 && *(p - 1) != '\\') {
-				if (lc == c) {
-					lc = '\0';
-				}
-				else if (lc != '\\') {
-					lc = c;
-				}
-			}
-			else if (state == 0 && !erase) {
-				*(rp++) = c;
-			}
-			if (state && p != src->data && *(p - 1) != '\\' &&
-				(!in_q || *p == in_q)) {
-				if (in_q) {
-					in_q = 0;
-				}
-				else {
-					in_q = *p;
-				}
-			}
-			break;
-
-		case '!':
-			/* JavaScript & Other HTML scripting languages */
-			if (state == 1 && *(p - 1) == '<') {
-				state = 3;
-				lc = c;
-			}
-			else {
-				if (state == 0 && !erase) {
-					*(rp++) = c;
-				}
-			}
-			break;
-
-		case '-':
-			if (state == 3 && p >= src->data + 2 && *(p - 1) == '-' &&
-				*(p - 2) == '!') {
-				state = 4;
-			}
-			else {
-				goto reg_char;
-			}
-			break;
-
-		case '&':
-			/* Decode entitle */
-			html_decode = TRUE;
-			estart = rp;
-			goto reg_char;
-			break;
-
-		case ';':
-			if (html_decode) {
-				html_decode = FALSE;
-				*rp = ';';
-				if (rp - estart > 0) {
-					dlen = rp - estart + 1;
-					rspamd_html_decode_entitles_inplace (estart, &dlen);
-					rp = estart + dlen;
-				}
-			}
-			break;
-
-		case '?':
-
-			if (state == 1 && *(p - 1) == '<') {
-				br = 0;
-				state = 2;
-				break;
-			}
-		case 'E':
-		case 'e':
-			/* !DOCTYPE exception */
-			if (state == 3 && p > src->data + 6
-				&& g_ascii_tolower (*(p - 1)) == 'p'
-				&& g_ascii_tolower (*(p - 2)) == 'y'
-				&& g_ascii_tolower (*(p - 3)) == 't' &&
-				g_ascii_tolower (*(p - 4)) == 'c' &&
-				g_ascii_tolower (*(p - 5)) == 'o' &&
-				g_ascii_tolower (*(p - 6)) == 'd') {
-				state = 1;
-				break;
-			}
-		/* fall-through */
-		case 'l':
-
-			/* swm: If we encounter '<?xml' then we shouldn't be in
-			 * state == 2 (PHP). Switch back to HTML.
-			 */
-
-			if (state == 2 && p > src->data + 2 && *(p - 1) == 'm' &&
-				*(p - 2) == 'x') {
-				state = 1;
-				break;
-			}
-
-		/* fall-through */
-		default:
-reg_char:
-			if (state == 0 && !erase) {
-				*(rp++) = c;
-			}
-			break;
-		}
-		i++;
-		if (i < (gint)src->len) {
-			c = *(++p);
-		}
-	}
-	if (rp < buf->data + src->len) {
-		*rp = '\0';
-		g_byte_array_set_size (buf, rp - buf->data);
-	}
-
-	/* Check tag balancing */
-	if (level_ptr && level_ptr->data != NULL) {
-		part->flags &= ~RSPAMD_MIME_PART_FLAG_BALANCED;
-	}
-	else {
-		part->flags |= RSPAMD_MIME_PART_FLAG_BALANCED;
-	}
-
-	if (stateptr) {
-		*stateptr = state;
-	}
-
-	return buf;
-}
-
 static void
 parse_qmail_recv (rspamd_mempool_t * pool,
 	gchar *line,
@@ -1386,21 +1131,17 @@ process_text_part (struct rspamd_task *task,
 				text_part->orig,
 				type,
 				text_part);
-		text_part->html_nodes = NULL;
+		text_part->html = rspamd_mempool_alloc (task->task_pool,
+				sizeof (*text_part->html));
 		text_part->parent = parent;
 		text_part->mime_part = mime_part;
 
 		text_part->flags |= RSPAMD_MIME_PART_FLAG_BALANCED;
-		text_part->content = strip_html_tags (task,
+		text_part->content = rspamd_html_process_part (
 				task->task_pool,
-				text_part,
-				part_content,
-				NULL);
+				text_part->html,
+				part_content);
 
-		if (text_part->html_nodes != NULL) {
-			rspamd_html_decode_entitles_inplace (text_part->content->data,
-				&text_part->content->len);
-		}
 		rspamd_url_text_extract (task->task_pool, task, text_part, TRUE);
 
 		rspamd_mempool_add_destructor (task->task_pool,