aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-16 09:36:06 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-16 09:36:06 +0100
commit c81b5b124a6ce760894b46f254d64739175b9590 (patch)
tree 067dcefb2d9c088b1c24db569fb1be3592a970a2
parent 757d70003805aebf8bff4acdd9c1a8a273065649 (diff)
download rspamd-c81b5b124a6ce760894b46f254d64739175b9590.tar.gz
download rspamd-c81b5b124a6ce760894b46f254d64739175b9590.zip
Use new HTML API in message.c
-rw-r--r-- src/libmime/message.c 269
-rw-r--r-- src/libserver/html.c 4
-rw-r--r-- src/libserver/html.h 2
3 files changed, 8 insertions, 267 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 70885a36d..f48151d05 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -53,261 +53,6 @@ rspamd_message_quark (void)
return g_quark_from_static_string ("mime-error");
}
-GByteArray *
-strip_html_tags (struct rspamd_task *task,
- rspamd_mempool_t * pool,
- struct mime_text_part *part,
- GByteArray * src,
- gint *stateptr)
-{
- uint8_t *p, *rp, *tbegin = NULL, *end, c, lc, *estart = NULL;
- gint br, i = 0, depth = 0, in_q = 0;
- gint state = 0;
- guint dlen;
- GByteArray *buf;
- GNode *level_ptr = NULL;
- gboolean erase = FALSE, html_decode = FALSE;
-
- if (stateptr)
- state = *stateptr;
-
- buf = g_byte_array_sized_new (src->len);
- g_byte_array_append (buf, src->data, src->len);
-
- c = *src->data;
- lc = '\0';
- p = src->data;
- rp = buf->data;
- end = src->data + src->len;
- br = 0;
-
- while (i < (gint)src->len) {
- switch (c) {
- case '\0':
- break;
- case '<':
- if (g_ascii_isspace (*(p + 1))) {
- goto reg_char;
- }
- if (state == 0) {
- lc = '<';
- tbegin = p + 1;
- state = 1;
- }
- else if (state == 1) {
- /* Opening bracket without closing one */
- p--;
- while (g_ascii_isspace (*p) && p > src->data) {
- p--;
- }
- p++;
- goto unbreak_tag;
- }
- break;
-
- case '(':
- if (state == 2) {
- if (lc != '"' && lc != '\'') {
- lc = '(';
- br++;
- }
- }
- else if (state == 0 && !erase) {
- *(rp++) = c;
- }
- break;
-
- case ')':
- if (state == 2) {
- if (lc != '"' && lc != '\'') {
- lc = ')';
- br--;
- }
- }
- else if (state == 0 && !erase) {
- *(rp++) = c;
- }
- break;
-
- case '>':
- if (depth) {
- depth--;
- break;
- }
-
- if (in_q) {
- break;
- }
-unbreak_tag:
- switch (state) {
- case 1: /* HTML/XML */
- lc = '>';
- in_q = state = 0;
- erase = !add_html_node (task,
- pool,
- part,
- tbegin,
- p - tbegin,
- end - tbegin,
- &level_ptr);
- break;
-
- case 2: /* PHP */
- if (!br && lc != '\"' && *(p - 1) == '?') {
- in_q = state = 0;
- }
- break;
-
- case 3:
- in_q = state = 0;
- break;
-
- case 4: /* JavaScript/CSS/etc... */
- if (p >= src->data + 2 && *(p - 1) == '-' && *(p - 2) == '-') {
- in_q = state = 0;
- }
- break;
-
- default:
- if (!erase) {
- *(rp++) = c;
- }
- break;
- }
- break;
-
- case '"':
- case '\'':
- if (state == 2 && *(p - 1) != '\\') {
- if (lc == c) {
- lc = '\0';
- }
- else if (lc != '\\') {
- lc = c;
- }
- }
- else if (state == 0 && !erase) {
- *(rp++) = c;
- }
- if (state && p != src->data && *(p - 1) != '\\' &&
- (!in_q || *p == in_q)) {
- if (in_q) {
- in_q = 0;
- }
- else {
- in_q = *p;
- }
- }
- break;
-
- case '!':
- /* JavaScript & Other HTML scripting languages */
- if (state == 1 && *(p - 1) == '<') {
- state = 3;
- lc = c;
- }
- else {
- if (state == 0 && !erase) {
- *(rp++) = c;
- }
- }
- break;
-
- case '-':
- if (state == 3 && p >= src->data + 2 && *(p - 1) == '-' &&
- *(p - 2) == '!') {
- state = 4;
- }
- else {
- goto reg_char;
- }
- break;
-
- case '&':
- /* Decode entitle */
- html_decode = TRUE;
- estart = rp;
- goto reg_char;
- break;
-
- case ';':
- if (html_decode) {
- html_decode = FALSE;
- *rp = ';';
- if (rp - estart > 0) {
- dlen = rp - estart + 1;
- rspamd_html_decode_entitles_inplace (estart, &dlen);
- rp = estart + dlen;
- }
- }
- break;
-
- case '?':
-
- if (state == 1 && *(p - 1) == '<') {
- br = 0;
- state = 2;
- break;
- }
- case 'E':
- case 'e':
- /* !DOCTYPE exception */
- if (state == 3 && p > src->data + 6
- && g_ascii_tolower (*(p - 1)) == 'p'
- && g_ascii_tolower (*(p - 2)) == 'y'
- && g_ascii_tolower (*(p - 3)) == 't' &&
- g_ascii_tolower (*(p - 4)) == 'c' &&
- g_ascii_tolower (*(p - 5)) == 'o' &&
- g_ascii_tolower (*(p - 6)) == 'd') {
- state = 1;
- break;
- }
- /* fall-through */
- case 'l':
-
- /* swm: If we encounter '<?xml' then we shouldn't be in
- * state == 2 (PHP). Switch back to HTML.
- */
-
- if (state == 2 && p > src->data + 2 && *(p - 1) == 'm' &&
- *(p - 2) == 'x') {
- state = 1;
- break;
- }
-
- /* fall-through */
- default:
-reg_char:
- if (state == 0 && !erase) {
- *(rp++) = c;
- }
- break;
- }
- i++;
- if (i < (gint)src->len) {
- c = *(++p);
- }
- }
- if (rp < buf->data + src->len) {
- *rp = '\0';
- g_byte_array_set_size (buf, rp - buf->data);
- }
-
- /* Check tag balancing */
- if (level_ptr && level_ptr->data != NULL) {
- part->flags &= ~RSPAMD_MIME_PART_FLAG_BALANCED;
- }
- else {
- part->flags |= RSPAMD_MIME_PART_FLAG_BALANCED;
- }
-
- if (stateptr) {
- *stateptr = state;
- }
-
- return buf;
-}
-
static void
parse_qmail_recv (rspamd_mempool_t * pool,
gchar *line,
@@ -1386,21 +1131,17 @@ process_text_part (struct rspamd_task *task,
text_part->orig,
type,
text_part);
- text_part->html_nodes = NULL;
+ text_part->html = rspamd_mempool_alloc (task->task_pool,
+ sizeof (*text_part->html));
text_part->parent = parent;
text_part->mime_part = mime_part;
text_part->flags |= RSPAMD_MIME_PART_FLAG_BALANCED;
- text_part->content = strip_html_tags (task,
+ text_part->content = rspamd_html_process_part (
task->task_pool,
- text_part,
- part_content,
- NULL);
+ text_part->html,
+ part_content);
- if (text_part->html_nodes != NULL) {
- rspamd_html_decode_entitles_inplace (text_part->content->data,
- &text_part->content->len);
- }
rspamd_url_text_extract (task->task_pool, task, text_part, TRUE);
rspamd_mempool_add_destructor (task->task_pool,
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 6af61a199..5bf042153 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -1365,7 +1365,7 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
*statep = state;
}
-gboolean
+GByteArray*
rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
GByteArray *in)
{
@@ -1635,5 +1635,5 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
}
}
- return TRUE;
+ return dest;
}
diff --git a/src/libserver/html.h b/src/libserver/html.h
index 0f1a95dc7..3095813c2 100644
--- a/src/libserver/html.h
+++ b/src/libserver/html.h
@@ -63,7 +63,7 @@ struct html_tag * get_tag_by_name (const gchar *name);
*/
guint rspamd_html_decode_entitles_inplace (gchar *s, guint len);
-gboolean rspamd_html_process_part (rspamd_mempool_t *pool,
+GByteArray* rspamd_html_process_part (rspamd_mempool_t *pool,
struct html_content *hc,
GByteArray *in);