From 8647250389da44e3cec0f9f7c0c2e4c47c93195c Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 15 May 2009 18:15:54 +0400 Subject: [PATCH] * Add simple html parser and tag balancing detector * Add function for searching html tag --- CMakeLists.txt | 5 +- README.utf8.txt | 2 + src/expressions.c | 91 +++++++++++++ src/html.c | 319 ++++++++++++++++++++++++++++++++++++++++++++++ src/html.h | 210 ++++++++++++++++++++++++++++++ src/message.c | 29 ++++- src/message.h | 2 + 7 files changed, 651 insertions(+), 7 deletions(-) create mode 100644 src/html.c create mode 100644 src/html.h diff --git a/CMakeLists.txt b/CMakeLists.txt index e346370d1..f744fff03 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ PROJECT(rspamd C) SET(RSPAMD_VERSION_MAJOR 0) SET(RSPAMD_VERSION_MINOR 1) -SET(RSPAMD_VERSION_PATCH 1) +SET(RSPAMD_VERSION_PATCH 2) SET(RSPAMD_VERSION "${RSPAMD_VERSION_MAJOR}.${RSPAMD_VERSION_MINOR}.${RSPAMD_VERSION_PATCH}") SET(RSPAMD_MASTER_SITE_URL "http://cebka.pp.ru/hg/rspamd") @@ -304,6 +304,7 @@ SET(RSPAMDSRC src/modules.c src/controller.c src/cfg_utils.c src/buffer.c + src/html.c src/lmtp.c src/lmtp_proto.c) @@ -339,6 +340,7 @@ SET(TESTDEPENDS src/mem_pool.c src/fuzzy.c src/memcached.c src/message.c + src/html.c src/expressions.c src/statfile.c) @@ -351,6 +353,7 @@ SET(UTILSDEPENDS src/mem_pool.c src/fuzzy.c src/expressions.c src/message.c + src/html.c src/util.c) LIST(LENGTH PLUGINSSRC RSPAMD_MODULES_NUM) diff --git a/README.utf8.txt b/README.utf8.txt index a52e380f4..b27e2876f 100644 --- a/README.utf8.txt +++ b/README.utf8.txt @@ -156,6 +156,8 @@ SOME_SYMBOL = "${to_blah} & !(${from_blah} | ${subject_blah})" * has_only_html_part - функция возвращает TRUE, если в сообщении есть только одна HTML часть * compare_recipients_distance - вычисляет процент схожих получателей письма. Принимает аргумент - порог в процентах похожести. 
* is_recipients_sorted - возвращает TRUE, если список получателей сортирован (работает только если число получателей >= 5). + * is_html_balanced - возвращает TRUE, если теги всех html частей сбалансированы + * has_html_tag - возвращает TRUE, если заданный html тег найден Модуль chartable. ================ diff --git a/src/expressions.c b/src/expressions.c index c7b88adb9..05bc12e88 100644 --- a/src/expressions.c +++ b/src/expressions.c @@ -29,6 +29,7 @@ #include "message.h" #include "fuzzy.h" #include "expressions.h" +#include "html.h" gboolean rspamd_compare_encoding (struct worker_task *task, GList *args); gboolean rspamd_header_exists (struct worker_task *task, GList *args); @@ -43,6 +44,8 @@ gboolean rspamd_has_content_part_len (struct worker_task *task, GList *args); gboolean rspamd_has_only_html_part (struct worker_task *task, GList *args); gboolean rspamd_is_recipients_sorted (struct worker_task *task, GList *args); gboolean rspamd_compare_transfer_encoding (struct worker_task *task, GList *args); +gboolean rspamd_is_html_balanced (struct worker_task *task, GList *args); +gboolean rspamd_has_html_tag (struct worker_task *task, GList *args); /* * List of internal functions of rspamd @@ -62,8 +65,10 @@ static struct _fl { { "content_type_is_type", rspamd_content_type_is_type }, { "has_content_part", rspamd_has_content_part }, { "has_content_part_len", rspamd_has_content_part_len }, + { "has_html_tag", rspamd_has_html_tag }, { "has_only_html_part", rspamd_has_only_html_part }, { "header_exists", rspamd_header_exists }, + { "is_html_balanced", rspamd_is_html_balanced }, { "is_recipients_sorted", rspamd_is_recipients_sorted }, }; @@ -1523,6 +1528,92 @@ rspamd_compare_transfer_encoding (struct worker_task *task, GList *args) return FALSE; } +/* Expression function: returns FALSE as soon as an HTML text part whose is_balanced flag is unset is found, TRUE otherwise (also when the task has no HTML parts). args is not used. */ gboolean +rspamd_is_html_balanced (struct worker_task *task, GList *args) +{ + struct mime_text_part *p; + GList *cur; + gboolean res = TRUE; + + cur = g_list_first (task->text_parts); + while (cur) { + p = cur->data; + /* Non-HTML parts do not influence the result */ if
(p->is_html) { + if (p->is_balanced) { + res = TRUE; + } + else { + /* One unbalanced HTML part decides the result, stop scanning */ res = FALSE; + break; + } + } + cur = g_list_next (cur); + } + + return res; + +} + +/* State shared with search_html_node_callback: the tag definition to look for and where to store the verdict */ struct html_callback_data { + struct html_tag *tag; + gboolean *res; +}; + +/* GNode traversal callback: when the node carries data whose tag pointer equals cd->tag, sets *cd->res to TRUE and returns TRUE to stop the traversal; returns FALSE otherwise */ static gboolean +search_html_node_callback (GNode *node, gpointer data) +{ + struct html_callback_data *cd = data; + struct html_node *nd; + + nd = node->data; + /* Nodes without data (the tree root is created with NULL data) are skipped */ if (nd) { + /* Tag definitions are compared by pointer identity */ if (nd->tag == cd->tag) { + *cd->res = TRUE; + return TRUE; + } + } + + return FALSE; +} + +/* Expression function: returns TRUE when the tag named by the first argument occurs in the node tree of any HTML text part; returns FALSE (with a warning) when no argument is given or the name is not a known tag */ gboolean +rspamd_has_html_tag (struct worker_task *task, GList *args) +{ + struct mime_text_part *p; + GList *cur; + struct expression_argument *arg; + struct html_tag *tag; + gboolean res = FALSE; + struct html_callback_data cd; + + if (args == NULL) { + msg_warn ("rspamd_has_html_tag: no parameters to function"); + return FALSE; + } + + arg = get_function_arg (args->data, task, TRUE); /* NOTE(review): arg is dereferenced below without a NULL check - confirm get_function_arg cannot return NULL */ + tag = get_tag_by_name (arg->data); + if (tag == NULL) { + msg_warn ("rspamd_has_html_tag: unknown tag type passed as argument: %s", (char *)arg->data); + return FALSE; + } + + cur = g_list_first (task->text_parts); + cd.res = &res; + cd.tag = tag; + + /* Stop at the first part in which the callback reported a match through cd.res */ while (cur && res == FALSE) { + p = cur->data; + if (p->is_html && p->html_nodes) { + g_node_traverse (p->html_nodes, G_PRE_ORDER, G_TRAVERSE_ALL, -1, search_html_node_callback, &cd); + } + cur = g_list_next (cur); + } + + return res; + +} + /* * vi:ts=4 */ diff --git a/src/html.c b/src/html.c new file mode 100644 index 000000000..9a816c4bd --- /dev/null +++ b/src/html.c @@ -0,0 +1,319 @@ +/* + * Copyright (c) 2009, Rambler media + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "util.h" +#include "main.h" +#include "message.h" +#include "html.h" + +/* Set to 1 once tag_defs has been sorted by name */ +sig_atomic_t tags_sorted = 0; + +/* + * Known tags with their content-model flags. As written the table is not in + * name order (nextid and the proprietary elements follow xmp), so it is sorted + * at first use before any bsearch is run on it. + */ +static struct html_tag tag_defs[] = +{ + /* W3C defined elements */ + { Tag_A, "a", (CM_INLINE)}, + { Tag_ABBR, "abbr", (CM_INLINE)}, + { Tag_ACRONYM, "acronym", (CM_INLINE)}, + { Tag_ADDRESS, "address", (CM_BLOCK)}, + { Tag_APPLET, "applet", (CM_OBJECT|CM_IMG|CM_INLINE|CM_PARAM)}, + { Tag_AREA, "area", (CM_BLOCK|CM_EMPTY)}, + { Tag_B, "b", (CM_INLINE)}, + { Tag_BASE, "base", (CM_HEAD|CM_EMPTY)}, + { Tag_BASEFONT, "basefont", (CM_INLINE|CM_EMPTY)}, + { Tag_BDO, "bdo", (CM_INLINE)}, + { Tag_BIG, "big", (CM_INLINE)}, + { Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)}, + { Tag_BODY, "body", (CM_HTML|CM_OPT|CM_OMITST)}, + { Tag_BR, "br", (CM_INLINE|CM_EMPTY)}, + { Tag_BUTTON, "button", (CM_INLINE)}, + { Tag_CAPTION, "caption", (CM_TABLE)}, + { Tag_CENTER, "center", (CM_BLOCK)}, + { Tag_CITE, "cite", (CM_INLINE)}, + { Tag_CODE, "code", (CM_INLINE)}, + { Tag_COL, "col",
(CM_TABLE|CM_EMPTY)}, + { Tag_COLGROUP, "colgroup", (CM_TABLE|CM_OPT)}, + { Tag_DD, "dd", (CM_DEFLIST|CM_OPT|CM_NO_INDENT)}, + { Tag_DEL, "del", (CM_INLINE|CM_BLOCK|CM_MIXED)}, + { Tag_DFN, "dfn", (CM_INLINE)}, + { Tag_DIR, "dir", (CM_BLOCK|CM_OBSOLETE)}, + { Tag_DIV, "div", (CM_BLOCK)}, + { Tag_DL, "dl", (CM_BLOCK)}, + { Tag_DT, "dt", (CM_DEFLIST|CM_OPT|CM_NO_INDENT)}, + { Tag_EM, "em", (CM_INLINE)}, + { Tag_FIELDSET, "fieldset", (CM_BLOCK)}, + { Tag_FONT, "font", (CM_INLINE)}, + { Tag_FORM, "form", (CM_BLOCK)}, + { Tag_FRAME, "frame", (CM_FRAMES|CM_EMPTY)}, + { Tag_FRAMESET, "frameset", (CM_HTML|CM_FRAMES)}, + { Tag_H1, "h1", (CM_BLOCK|CM_HEADING)}, + { Tag_H2, "h2", (CM_BLOCK|CM_HEADING)}, + { Tag_H3, "h3", (CM_BLOCK|CM_HEADING)}, + { Tag_H4, "h4", (CM_BLOCK|CM_HEADING)}, + { Tag_H5, "h5", (CM_BLOCK|CM_HEADING)}, + { Tag_H6, "h6", (CM_BLOCK|CM_HEADING)}, + { Tag_HEAD, "head", (CM_HTML|CM_OPT|CM_OMITST)}, + { Tag_HR, "hr", (CM_BLOCK|CM_EMPTY)}, + { Tag_HTML, "html", (CM_HTML|CM_OPT|CM_OMITST)}, + { Tag_I, "i", (CM_INLINE)}, + { Tag_IFRAME, "iframe", (CM_INLINE)}, + { Tag_IMG, "img", (CM_INLINE|CM_IMG|CM_EMPTY)}, + { Tag_INPUT, "input", (CM_INLINE|CM_IMG|CM_EMPTY)}, + { Tag_INS, "ins", (CM_INLINE|CM_BLOCK|CM_MIXED)}, + { Tag_ISINDEX, "isindex", (CM_BLOCK|CM_EMPTY)}, + { Tag_KBD, "kbd", (CM_INLINE)}, + { Tag_LABEL, "label", (CM_INLINE)}, + { Tag_LEGEND, "legend", (CM_INLINE)}, + { Tag_LI, "li", (CM_LIST|CM_OPT|CM_NO_INDENT)}, + { Tag_LINK, "link", (CM_HEAD|CM_EMPTY)}, + { Tag_LISTING, "listing", (CM_BLOCK|CM_OBSOLETE)}, + { Tag_MAP, "map", (CM_INLINE)}, + { Tag_MENU, "menu", (CM_BLOCK|CM_OBSOLETE)}, + { Tag_META, "meta", (CM_HEAD|CM_EMPTY)}, + { Tag_NOFRAMES, "noframes", (CM_BLOCK|CM_FRAMES)}, + { Tag_NOSCRIPT, "noscript", (CM_BLOCK|CM_INLINE|CM_MIXED)}, + { Tag_OBJECT, "object", (CM_OBJECT|CM_HEAD|CM_IMG|CM_INLINE|CM_PARAM)}, + { Tag_OL, "ol", (CM_BLOCK)}, + { Tag_OPTGROUP, "optgroup", (CM_FIELD|CM_OPT)}, + { Tag_OPTION, "option", (CM_FIELD|CM_OPT)}, + { Tag_P,
"p", (CM_BLOCK|CM_OPT)}, + { Tag_PARAM, "param", (CM_INLINE|CM_EMPTY)}, + { Tag_PLAINTEXT, "plaintext", (CM_BLOCK|CM_OBSOLETE)}, + { Tag_PRE, "pre", (CM_BLOCK)}, + { Tag_Q, "q", (CM_INLINE)}, + { Tag_RB, "rb", (CM_INLINE)}, + { Tag_RBC, "rbc", (CM_INLINE)}, + { Tag_RP, "rp", (CM_INLINE)}, + { Tag_RT, "rt", (CM_INLINE)}, + { Tag_RTC, "rtc", (CM_INLINE)}, + { Tag_RUBY, "ruby", (CM_INLINE)}, + { Tag_S, "s", (CM_INLINE)}, + { Tag_SAMP, "samp", (CM_INLINE)}, + { Tag_SCRIPT, "script", (CM_HEAD|CM_MIXED|CM_BLOCK|CM_INLINE)}, + { Tag_SELECT, "select", (CM_INLINE|CM_FIELD)}, + { Tag_SMALL, "small", (CM_INLINE)}, + { Tag_SPAN, "span", (CM_INLINE)}, + { Tag_STRIKE, "strike", (CM_INLINE)}, + { Tag_STRONG, "strong", (CM_INLINE)}, + { Tag_STYLE, "style", (CM_HEAD)}, + { Tag_SUB, "sub", (CM_INLINE)}, + { Tag_SUP, "sup", (CM_INLINE)}, + { Tag_TABLE, "table", (CM_BLOCK)}, + { Tag_TBODY, "tbody", (CM_TABLE|CM_ROWGRP|CM_OPT)}, + { Tag_TD, "td", (CM_ROW|CM_OPT|CM_NO_INDENT)}, + { Tag_TEXTAREA, "textarea", (CM_INLINE|CM_FIELD)}, + { Tag_TFOOT, "tfoot", (CM_TABLE|CM_ROWGRP|CM_OPT)}, + { Tag_TH, "th", (CM_ROW|CM_OPT|CM_NO_INDENT)}, + { Tag_THEAD, "thead", (CM_TABLE|CM_ROWGRP|CM_OPT)}, + { Tag_TITLE, "title", (CM_HEAD)}, + { Tag_TR, "tr", (CM_TABLE|CM_OPT)}, + { Tag_TT, "tt", (CM_INLINE)}, + { Tag_U, "u", (CM_INLINE)}, + { Tag_UL, "ul", (CM_BLOCK)}, + { Tag_VAR, "var", (CM_INLINE)}, + { Tag_XMP, "xmp", (CM_BLOCK|CM_OBSOLETE)}, + { Tag_NEXTID, "nextid", (CM_HEAD|CM_EMPTY)}, + + /* proprietary elements */ + { Tag_ALIGN, "align", (CM_BLOCK)}, + { Tag_BGSOUND, "bgsound", (CM_HEAD|CM_EMPTY)}, + { Tag_BLINK, "blink", (CM_INLINE)}, + { Tag_COMMENT, "comment", (CM_INLINE)}, + { Tag_EMBED, "embed", (CM_INLINE|CM_IMG|CM_EMPTY)}, + { Tag_ILAYER, "ilayer", (CM_INLINE)}, + { Tag_KEYGEN, "keygen", (CM_INLINE|CM_EMPTY)}, + { Tag_LAYER, "layer", (CM_BLOCK)}, + { Tag_MARQUEE, "marquee", (CM_INLINE|CM_OPT)}, + { Tag_MULTICOL, "multicol", (CM_BLOCK)}, + { Tag_NOBR, "nobr", (CM_INLINE)}, + { Tag_NOEMBED,
"noembed", (CM_INLINE)}, + { Tag_NOLAYER, "nolayer", (CM_BLOCK|CM_INLINE|CM_MIXED)}, + { Tag_NOSAVE, "nosave", (CM_BLOCK)}, + { Tag_SERVER, "server", (CM_HEAD|CM_MIXED|CM_BLOCK|CM_INLINE)}, + { Tag_SERVLET, "servlet", (CM_OBJECT|CM_IMG|CM_INLINE|CM_PARAM)}, + { Tag_SPACER, "spacer", (CM_INLINE|CM_EMPTY)}, + { Tag_WBR, "wbr", (CM_INLINE|CM_EMPTY)}, +}; + +/* Case-insensitive comparison of two html_tag entries by name (qsort/bsearch callback) */ +static int +tag_cmp (const void *m1, const void *m2) +{ + const struct html_tag *p1 = m1; + const struct html_tag *p2 = m2; + + return g_ascii_strcasecmp (p1->name, p2->name); +} + +/* Sort tag_defs by name the first time it is needed, so that every bsearch sees a sorted table */ +static void +sort_tag_defs_once (void) +{ + if (!tags_sorted) { + qsort (tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp); + tags_sorted = 1; + } +} + +/* + * Build a tree node for the text found between '<' and '>'. + * The html_node data is taken from pool; the GNode itself comes from g_node_new. + * text is NUL-patched temporarily while the tag name is looked up and restored afterwards. + * Returns NULL when text is NULL or empty, or when the tag name is not in tag_defs. + */ +static GNode* +construct_html_node (memory_pool_t *pool, char *text) +{ + struct html_node *html; + GNode *n = NULL; + struct html_tag key, *found; + char t; + int taglen; + + /* Validate text before measuring it: calling strlen on NULL is undefined */ + if (text == NULL || *text == '\0') { + return NULL; + } + taglen = strlen (text); + + html = memory_pool_alloc0 (pool, sizeof (struct html_node)); + + /* Check whether this tag is fully closed */ + if (*(text + taglen - 1) == '/') { + html->flags |= FL_CLOSED; + } + + /* Check xml tag */ + if (*text == '?'
&& g_ascii_strncasecmp (text + 1, "xml", sizeof ("xml") - 1) == 0) { + html->flags |= FL_XML; + /* XML nodes carry no tag definition; readers of ->tag must expect NULL */ + html->tag = NULL; + } + else { + if (*text == '/') { + html->flags |= FL_CLOSING; + text ++; + } + + /* Find end of tag name */ + key.name = text; + while (*text && g_ascii_isalnum (*(++text))); + + t = *text; + *text = '\0'; + + /* Match tag id by tag name */ + if ((found = bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp)) != NULL) { + *text = t; + html->tag = found; + } + else { + *text = t; + return NULL; + } + } + + n = g_node_new (html); + + return n; +} + +/* + * Try to pair a closing tag node with the nearest ancestor that has the same tag id + * and is not yet marked FL_CLOSED. On success the ancestor is marked FL_CLOSED, node is + * destroyed and *cur_level is moved to the ancestor's parent. + * Returns TRUE for non-closing nodes and for paired closing tags, FALSE for an unpaired closing tag. + */ +static gboolean +check_balance (GNode *node, GNode **cur_level) +{ + struct html_node *arg = node->data, *tmp; + GNode *cur; + + if (arg->flags & FL_CLOSING) { + /* First of all check whether this tag is closing tag for parent node */ + cur = node->parent; + /* The walk stops at the root node, whose data is NULL */ + while (cur && cur->data) { + tmp = cur->data; + /* Ancestors without a tag definition (XML nodes) can never match a closing tag */ + if (tmp->tag != NULL && tmp->tag->id == arg->tag->id && (tmp->flags & FL_CLOSED) == 0) { + msg_debug ("check_balance: found closing tag for parent '%s'", tmp->tag->name); + tmp->flags |= FL_CLOSED; + /* Destroy current node as we find corresponding parent node */ + g_node_destroy (node); + /* Change level */ + *cur_level = cur->parent; + return TRUE; + } + cur = cur->parent; + } + } + else { + return TRUE; + } + + msg_debug ("check_balance: found unbalanced tag %s", arg->tag->name); + return FALSE; +} + +/* + * Look up a tag definition by name (case-insensitive). + * Returns NULL when name is NULL or is not a known tag. + */ +struct html_tag * +get_tag_by_name (const char *name) +{ + struct html_tag key; + + if (name == NULL) { + return NULL; + } + + /* This may be the first user of the table if no HTML part has been parsed yet */ + sort_tag_defs_once (); + key.name = name; + + return bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp); +} + +/* + * Parse tag_text (the text between '<' and '>') and add it to the node tree of part, + * creating the root node (with NULL data) on first use. *cur_level tracks the node that + * new nodes are appended to. part->is_balanced is cleared when a closing tag cannot be + * paired with an open ancestor. + * Returns FALSE when no node can be built from tag_text or *cur_level is NULL, TRUE otherwise. + */ +gboolean +add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level) +{ + GNode *new; + struct html_node *data; + + sort_tag_defs_once (); + + /* First call of this function */ + if (part->html_nodes == NULL) { + /* Insert root node */ + new = g_node_new
(NULL); + *cur_level = new; + part->html_nodes = new; + memory_pool_add_destructor (pool, (pool_destruct_func)g_node_destroy, part->html_nodes); + /* Call once again with root node */ + return add_html_node (pool, part, tag_text, cur_level); + } + else { + new = construct_html_node (pool, tag_text); + if (new == NULL) { + msg_debug ("add_html_node: cannot construct HTML node for text '%s'", tag_text); + /* Failure must be FALSE: -1 is non-zero and would read as TRUE */ + return FALSE; + } + if (! *cur_level) { + msg_debug ("add_html_node: bad parent node"); + /* new is not linked into the tree, so the tree destructor would never free it */ + g_node_destroy (new); + return FALSE; + } + data = new->data; + if (data->flags & FL_CLOSING) { + g_node_append (*cur_level, new); + if (!check_balance (new, cur_level)) { + msg_debug ("add_html_node: mark part as unbalanced as it has not pairable closing tags"); + part->is_balanced = FALSE; + } + } + else { + g_node_append (*cur_level, new); + if ((data->flags & FL_CLOSED) == 0) { + /* data->tag is NULL for XML nodes */ + msg_debug ("add_html_node: append opening tag: '%s'", data->tag ? data->tag->name : "?xml"); + *cur_level = new; + } + else { + msg_debug ("add_html_node: append closed tag: '%s'", data->tag ? data->tag->name : "?xml"); + } + } + } + + return TRUE; +} + +/* + * vi:ts=4 + */ diff --git a/src/html.h b/src/html.h new file mode 100644 index 000000000..70f20de49 --- /dev/null +++ b/src/html.h @@ -0,0 +1,210 @@ +/* + * Functions for simple html parsing + */ + +#ifndef RSPAMD_HTML_H +#define RSPAMD_HTML_H + +#include "config.h" +#include "mem_pool.h" + +/* Known HTML tags */ +typedef enum +{ + Tag_UNKNOWN, /**< Unknown tag!
*/ + Tag_A, /**< A */ + Tag_ABBR, /**< ABBR */ + Tag_ACRONYM, /**< ACRONYM */ + Tag_ADDRESS, /**< ADDRESS */ + Tag_ALIGN, /**< ALIGN */ + Tag_APPLET, /**< APPLET */ + Tag_AREA, /**< AREA */ + Tag_B, /**< B */ + Tag_BASE, /**< BASE */ + Tag_BASEFONT, /**< BASEFONT */ + Tag_BDO, /**< BDO */ + Tag_BGSOUND, /**< BGSOUND */ + Tag_BIG, /**< BIG */ + Tag_BLINK, /**< BLINK */ + Tag_BLOCKQUOTE, /**< BLOCKQUOTE */ + Tag_BODY, /**< BODY */ + Tag_BR, /**< BR */ + Tag_BUTTON, /**< BUTTON */ + Tag_CAPTION, /**< CAPTION */ + Tag_CENTER, /**< CENTER */ + Tag_CITE, /**< CITE */ + Tag_CODE, /**< CODE */ + Tag_COL, /**< COL */ + Tag_COLGROUP, /**< COLGROUP */ + Tag_COMMENT, /**< COMMENT */ + Tag_DD, /**< DD */ + Tag_DEL, /**< DEL */ + Tag_DFN, /**< DFN */ + Tag_DIR, /**< DIR */ + Tag_DIV, /**< DIV */ + Tag_DL, /**< DL */ + Tag_DT, /**< DT */ + Tag_EM, /**< EM */ + Tag_EMBED, /**< EMBED */ + Tag_FIELDSET, /**< FIELDSET */ + Tag_FONT, /**< FONT */ + Tag_FORM, /**< FORM */ + Tag_FRAME, /**< FRAME */ + Tag_FRAMESET, /**< FRAMESET */ + Tag_H1, /**< H1 */ + Tag_H2, /**< H2 */ + Tag_H3, /**< H3 */ + Tag_H4, /**< H4 */ + Tag_H5, /**< H5 */ + Tag_H6, /**< H6 */ + Tag_HEAD, /**< HEAD */ + Tag_HR, /**< HR */ + Tag_HTML, /**< HTML */ + Tag_I, /**< I */ + Tag_IFRAME, /**< IFRAME */ + Tag_ILAYER, /**< ILAYER */ + Tag_IMG, /**< IMG */ + Tag_INPUT, /**< INPUT */ + Tag_INS, /**< INS */ + Tag_ISINDEX, /**< ISINDEX */ + Tag_KBD, /**< KBD */ + Tag_KEYGEN, /**< KEYGEN */ + Tag_LABEL, /**< LABEL */ + Tag_LAYER, /**< LAYER */ + Tag_LEGEND, /**< LEGEND */ + Tag_LI, /**< LI */ + Tag_LINK, /**< LINK */ + Tag_LISTING, /**< LISTING */ + Tag_MAP, /**< MAP */ + Tag_MARQUEE, /**< MARQUEE */ + Tag_MENU, /**< MENU */ + Tag_META, /**< META */ + Tag_MULTICOL, /**< MULTICOL */ + Tag_NOBR, /**< NOBR */ + Tag_NOEMBED, /**< NOEMBED */ + Tag_NOFRAMES, /**< NOFRAMES */ + Tag_NOLAYER, /**< NOLAYER */ + Tag_NOSAVE, /**< NOSAVE */ + Tag_NOSCRIPT, /**< NOSCRIPT */ + Tag_OBJECT, /**< OBJECT */ + Tag_OL, /**< OL */ +
Tag_OPTGROUP, /**< OPTGROUP */ + Tag_OPTION, /**< OPTION */ + Tag_P, /**< P */ + Tag_PARAM, /**< PARAM */ + Tag_PLAINTEXT,/**< PLAINTEXT */ + Tag_PRE, /**< PRE */ + Tag_Q, /**< Q */ + Tag_RB, /**< RB */ + Tag_RBC, /**< RBC */ + Tag_RP, /**< RP */ + Tag_RT, /**< RT */ + Tag_RTC, /**< RTC */ + Tag_RUBY, /**< RUBY */ + Tag_S, /**< S */ + Tag_SAMP, /**< SAMP */ + Tag_SCRIPT, /**< SCRIPT */ + Tag_SELECT, /**< SELECT */ + Tag_SERVER, /**< SERVER */ + Tag_SERVLET, /**< SERVLET */ + Tag_SMALL, /**< SMALL */ + Tag_SPACER, /**< SPACER */ + Tag_SPAN, /**< SPAN */ + Tag_STRIKE, /**< STRIKE */ + Tag_STRONG, /**< STRONG */ + Tag_STYLE, /**< STYLE */ + Tag_SUB, /**< SUB */ + Tag_SUP, /**< SUP */ + Tag_TABLE, /**< TABLE */ + Tag_TBODY, /**< TBODY */ + Tag_TD, /**< TD */ + Tag_TEXTAREA, /**< TEXTAREA */ + Tag_TFOOT, /**< TFOOT */ + Tag_TH, /**< TH */ + Tag_THEAD, /**< THEAD */ + Tag_TITLE, /**< TITLE */ + Tag_TR, /**< TR */ + Tag_TT, /**< TT */ + Tag_U, /**< U */ + Tag_UL, /**< UL */ + Tag_VAR, /**< VAR */ + Tag_WBR, /**< WBR */ + Tag_XMP, /**< XMP */ + Tag_XML, /**< XML */ + Tag_NEXTID, /**< NEXTID */ + + N_TAGS /**< Must be last */ +} tag_id_t; + +/* Content-model flags (CM_*), OR-ed together in html_tag.flags */ #define CM_UNKNOWN 0 +/* Elements with no content. Map to HTML specification. */ +#define CM_EMPTY (1 << 0) +/* Elements that appear outside of "BODY". */ +#define CM_HTML (1 << 1) +/* Elements that can appear within HEAD. */ +#define CM_HEAD (1 << 2) +/* HTML "block" elements. */ +#define CM_BLOCK (1 << 3) +/* HTML "inline" elements. */ +#define CM_INLINE (1 << 4) +/* Elements that mark list item ("LI"). */ +#define CM_LIST (1 << 5) +/* Elements that mark definition list item ("DD", "DT"). */ +#define CM_DEFLIST (1 << 6) +/* Elements that can appear inside TABLE. */ +#define CM_TABLE (1 << 7) +/* Used for "THEAD", "TFOOT" or "TBODY". */ +#define CM_ROWGRP (1 << 8) +/* Used for "TD", "TH" */ +#define CM_ROW (1 << 9) +/* Elements whose content must be protected against white space movement.
+ Includes some elements that can be found in forms. */ +#define CM_FIELD (1 << 10) +/* Used to avoid propagating inline emphasis inside some elements + such as OBJECT or APPLET. */ +#define CM_OBJECT (1 << 11) +/* Elements that allows "PARAM". */ +#define CM_PARAM (1 << 12) +/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */ +#define CM_FRAMES (1 << 13) +/* Heading elements (h1, h2, ...). */ +#define CM_HEADING (1 << 14) +/* Elements with an optional end tag. */ +#define CM_OPT (1 << 15) +/* Elements that use "align" attribute for vertical position. */ +#define CM_IMG (1 << 16) +/* Elements with inline and block model. Used to avoid calling InlineDup. */ +#define CM_MIXED (1 << 17) +/* Elements whose content needs to be indented only if containing one + CM_BLOCK element. */ +#define CM_NO_INDENT (1 << 18) +/* Elements that are obsolete (such as "dir", "menu"). */ +#define CM_OBSOLETE (1 << 19) +/* User defined elements. Used to determine how attributes without value + should be printed. */ +#define CM_NEW (1 << 20) +/* Elements that cannot be omitted. */ +#define CM_OMITST (1 << 21) + +/* Node flags (FL_*), stored in html_node.flags */ /* XML tag */ +#define FL_XML (1 << 0) +/* Closing tag */ +#define FL_CLOSING (1 << 1) +/* Fully closed tag (e.g. <br />
) */ +#define FL_CLOSED (1 << 2) + +/* Static description of a known tag: numeric id, name used for lookup and CM_* content-model flags */ struct html_tag { + tag_id_t id; + const char *name; + int flags; +}; + +/* Data attached to each node of a part's tag tree: the matched tag definition (NULL for XML nodes) and FL_* flags */ struct html_node { + struct html_tag *tag; + int flags; +}; + +/* Adds the tag described by tag_text to part's node tree; *cur_level is the node new tags are appended to and is updated by the call */ gboolean add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level); +/* Returns the definition of the tag with the given name, or NULL if the name is not known */ struct html_tag * get_tag_by_name (const char *name); + +#endif diff --git a/src/message.c b/src/message.c index 5d344db62..3024377d5 100644 --- a/src/message.c +++ b/src/message.c @@ -27,15 +27,17 @@ #include "main.h" #include "message.h" #include "cfg_file.h" +#include "html.h" #include "modules.h" GByteArray* -strip_html_tags (GByteArray *src, int *stateptr) +strip_html_tags (memory_pool_t *pool, struct mime_text_part *part, GByteArray *src, int *stateptr) { - uint8_t *tbuf = NULL, *p, *tp = NULL, *rp, c, lc; + uint8_t *tbuf = NULL, *p, *tp = NULL, *rp, *tbegin, c, lc; int br, i = 0, depth = 0, in_q = 0; int state = 0; GByteArray *buf; + GNode *level_ptr = NULL; if (stateptr) state = *stateptr; @@ -59,6 +61,7 @@ strip_html_tags (GByteArray *src, int *stateptr) } if (state == 0) { lc = '<'; + tbegin = p + 1; /* remember where the tag text starts, just after '<' */ state = 1; } else if (state == 1) { depth++; @@ -101,7 +104,9 @@ strip_html_tags (GByteArray *src, int *stateptr) case 1: /* HTML/XML */ lc = '>'; in_q = state = 0; - + *p = '\0'; /* terminate the tag text in place for the call below; '>' is restored right after it */ + add_html_node (pool, part, tbegin, &level_ptr); /* NOTE(review): return value is ignored, and tbegin is only assigned when '<' is seen in state 0 - confirm it cannot be read unset when *stateptr starts at 1 */ + *p = '>'; break; case 2: /* PHP */ @@ -220,9 +225,15 @@ reg_char: *rp = '\0'; g_byte_array_set_size (buf, rp - buf->data); } + + /* Check tag balancing: only the root node has NULL data, so a current level with data means some tag was left open */ + if (level_ptr && level_ptr->data != NULL) { + part->is_balanced = FALSE; + } - if (stateptr) + if (stateptr) { *stateptr = state; + } return buf; } @@ -287,8 +298,10 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part)); text_part->orig = convert_text_to_utf (task, part_content, type, text_part); - text_part->content = strip_html_tags (part_content, NULL); text_part->is_html = TRUE; +
text_part->is_balanced = TRUE; /* is_balanced and html_nodes are initialised before strip_html_tags, which is passed text_part */ + text_part->html_nodes = NULL; + text_part->content = strip_html_tags (task->task_pool, text_part, part_content, NULL); text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool); memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, text_part->content); task->text_parts = g_list_prepend (task->text_parts, text_part); @@ -591,10 +604,12 @@ process_learn (struct controller_session *session) return 0; } +/* + * XXX: remove this function for learning + */ GByteArray* get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur) { - GByteArray *ret = NULL; struct mime_part *p; if (*cur == NULL) { @@ -611,6 +626,7 @@ get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur) msg_debug ("get_next_text_part: text/plain part"); return p->content; } +#if 0 else if (g_mime_content_type_is_type (p->type, "text", "html")) { msg_debug ("get_next_text_part: try to strip html tags"); ret = strip_html_tags (p->content, NULL); @@ -623,6 +639,7 @@ get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur) memory_pool_add_destructor (pool, (pool_destruct_func)free_byte_array_callback, ret); return ret; } +#endif *cur = g_list_next (*cur); } diff --git a/src/message.h b/src/message.h index 9e9b5de1f..72711638f 100644 --- a/src/message.h +++ b/src/message.h @@ -17,8 +17,10 @@ struct mime_part { struct mime_text_part { gboolean is_html; gboolean is_raw; + gboolean is_balanced; GByteArray *orig; GByteArray *content; + GNode *html_nodes; fuzzy_hash_t *fuzzy; }; -- 2.39.5