diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-05-15 18:15:54 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-05-15 18:15:54 +0400 |
commit | 8647250389da44e3cec0f9f7c0c2e4c47c93195c (patch) | |
tree | 4e34957983d08c3f8d7ba41e23770ed09c39aaf5 /src/message.c | |
parent | 784dbf335644c385fb0f3a1fae70e3886f3b6f6e (diff) | |
download | rspamd-8647250389da44e3cec0f9f7c0c2e4c47c93195c.tar.gz rspamd-8647250389da44e3cec0f9f7c0c2e4c47c93195c.zip |
* Add a simple HTML parser and tag-balancing detector
* Add a function for searching for an HTML tag
Diffstat (limited to 'src/message.c')
-rw-r--r-- | src/message.c | 29 |
1 files changed, 23 insertions, 6 deletions
diff --git a/src/message.c b/src/message.c index 5d344db62..3024377d5 100644 --- a/src/message.c +++ b/src/message.c @@ -27,15 +27,17 @@ #include "main.h" #include "message.h" #include "cfg_file.h" +#include "html.h" #include "modules.h" GByteArray* -strip_html_tags (GByteArray *src, int *stateptr) +strip_html_tags (memory_pool_t *pool, struct mime_text_part *part, GByteArray *src, int *stateptr) { - uint8_t *tbuf = NULL, *p, *tp = NULL, *rp, c, lc; + uint8_t *tbuf = NULL, *p, *tp = NULL, *rp, *tbegin, c, lc; int br, i = 0, depth = 0, in_q = 0; int state = 0; GByteArray *buf; + GNode *level_ptr = NULL; if (stateptr) state = *stateptr; @@ -59,6 +61,7 @@ strip_html_tags (GByteArray *src, int *stateptr) } if (state == 0) { lc = '<'; + tbegin = p + 1; state = 1; } else if (state == 1) { depth++; @@ -101,7 +104,9 @@ strip_html_tags (GByteArray *src, int *stateptr) case 1: /* HTML/XML */ lc = '>'; in_q = state = 0; - + *p = '\0'; + add_html_node (pool, part, tbegin, &level_ptr); + *p = '>'; break; case 2: /* PHP */ @@ -220,9 +225,15 @@ reg_char: *rp = '\0'; g_byte_array_set_size (buf, rp - buf->data); } + + /* Check tag balancing */ + if (level_ptr && level_ptr->data != NULL) { + part->is_balanced = FALSE; + } - if (stateptr) + if (stateptr) { *stateptr = state; + } return buf; } @@ -287,8 +298,10 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part)); text_part->orig = convert_text_to_utf (task, part_content, type, text_part); - text_part->content = strip_html_tags (part_content, NULL); text_part->is_html = TRUE; + text_part->is_balanced = TRUE; + text_part->html_nodes = NULL; + text_part->content = strip_html_tags (task->task_pool, text_part, part_content, NULL); text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool); memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, text_part->content); 
task->text_parts = g_list_prepend (task->text_parts, text_part); @@ -591,10 +604,12 @@ process_learn (struct controller_session *session) return 0; } +/* + * XXX: remove this function for learning + */ GByteArray* get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur) { - GByteArray *ret = NULL; struct mime_part *p; if (*cur == NULL) { @@ -611,6 +626,7 @@ get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur) msg_debug ("get_next_text_part: text/plain part"); return p->content; } +#if 0 else if (g_mime_content_type_is_type (p->type, "text", "html")) { msg_debug ("get_next_text_part: try to strip html tags"); ret = strip_html_tags (p->content, NULL); @@ -623,6 +639,7 @@ get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur) memory_pool_add_destructor (pool, (pool_destruct_func)free_byte_array_callback, ret); return ret; } +#endif *cur = g_list_next (*cur); } |