aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2009-05-15 18:15:54 +0400
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2009-05-15 18:15:54 +0400
commit8647250389da44e3cec0f9f7c0c2e4c47c93195c (patch)
tree4e34957983d08c3f8d7ba41e23770ed09c39aaf5
parent784dbf335644c385fb0f3a1fae70e3886f3b6f6e (diff)
downloadrspamd-8647250389da44e3cec0f9f7c0c2e4c47c93195c.tar.gz
rspamd-8647250389da44e3cec0f9f7c0c2e4c47c93195c.zip
* Add simple html parser and tag balancing detector
* Add function for searching html tag
-rw-r--r--CMakeLists.txt5
-rw-r--r--README.utf8.txt2
-rw-r--r--src/expressions.c91
-rw-r--r--src/html.c319
-rw-r--r--src/html.h210
-rw-r--r--src/message.c29
-rw-r--r--src/message.h2
7 files changed, 651 insertions, 7 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e346370d1..f744fff03 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ PROJECT(rspamd C)
SET(RSPAMD_VERSION_MAJOR 0)
SET(RSPAMD_VERSION_MINOR 1)
-SET(RSPAMD_VERSION_PATCH 1)
+SET(RSPAMD_VERSION_PATCH 2)
SET(RSPAMD_VERSION "${RSPAMD_VERSION_MAJOR}.${RSPAMD_VERSION_MINOR}.${RSPAMD_VERSION_PATCH}")
SET(RSPAMD_MASTER_SITE_URL "http://cebka.pp.ru/hg/rspamd")
@@ -304,6 +304,7 @@ SET(RSPAMDSRC src/modules.c
src/controller.c
src/cfg_utils.c
src/buffer.c
+ src/html.c
src/lmtp.c
src/lmtp_proto.c)
@@ -339,6 +340,7 @@ SET(TESTDEPENDS src/mem_pool.c
src/fuzzy.c
src/memcached.c
src/message.c
+ src/html.c
src/expressions.c
src/statfile.c)
@@ -351,6 +353,7 @@ SET(UTILSDEPENDS src/mem_pool.c
src/fuzzy.c
src/expressions.c
src/message.c
+ src/html.c
src/util.c)
LIST(LENGTH PLUGINSSRC RSPAMD_MODULES_NUM)
diff --git a/README.utf8.txt b/README.utf8.txt
index a52e380f4..b27e2876f 100644
--- a/README.utf8.txt
+++ b/README.utf8.txt
@@ -156,6 +156,8 @@ SOME_SYMBOL = "${to_blah} & !(${from_blah} | ${subject_blah})"
* has_only_html_part - функция возвращает TRUE, если в сообщении есть только одна HTML часть
* compare_recipients_distance - вычисляет процент схожих получателей письма. Принимает аргумент - порог в процентах похожести.
* is_recipients_sorted - возвращает TRUE, если список получателей сортирован (работает только если число получателей >= 5).
+ * is_html_balanced - возвращает TRUE, если теги всех html частей сбалансированы
+ * has_html_tag - возвращает TRUE, если заданный html тег найден
Модуль chartable.
================
diff --git a/src/expressions.c b/src/expressions.c
index c7b88adb9..05bc12e88 100644
--- a/src/expressions.c
+++ b/src/expressions.c
@@ -29,6 +29,7 @@
#include "message.h"
#include "fuzzy.h"
#include "expressions.h"
+#include "html.h"
gboolean rspamd_compare_encoding (struct worker_task *task, GList *args);
gboolean rspamd_header_exists (struct worker_task *task, GList *args);
@@ -43,6 +44,8 @@ gboolean rspamd_has_content_part_len (struct worker_task *task, GList *args);
gboolean rspamd_has_only_html_part (struct worker_task *task, GList *args);
gboolean rspamd_is_recipients_sorted (struct worker_task *task, GList *args);
gboolean rspamd_compare_transfer_encoding (struct worker_task *task, GList *args);
+gboolean rspamd_is_html_balanced (struct worker_task *task, GList *args);
+gboolean rspamd_has_html_tag (struct worker_task *task, GList *args);
/*
* List of internal functions of rspamd
@@ -62,8 +65,10 @@ static struct _fl {
{ "content_type_is_type", rspamd_content_type_is_type },
{ "has_content_part", rspamd_has_content_part },
{ "has_content_part_len", rspamd_has_content_part_len },
+ { "has_html_tag", rspamd_has_html_tag },
{ "has_only_html_part", rspamd_has_only_html_part },
{ "header_exists", rspamd_header_exists },
+ { "is_html_balanced", rspamd_is_html_balanced },
{ "is_recipients_sorted", rspamd_is_recipients_sorted },
};
@@ -1523,6 +1528,92 @@ rspamd_compare_transfer_encoding (struct worker_task *task, GList *args)
return FALSE;
}
+/*
+ * Expression function: TRUE unless some HTML text part of the task has been
+ * marked as unbalanced. Non-HTML parts are ignored, so a task without HTML
+ * parts yields TRUE. The args list is not used.
+ */
+gboolean
+rspamd_is_html_balanced (struct worker_task *task, GList *args)
+{
+	struct mime_text_part *part;
+	GList *it;
+
+	for (it = g_list_first (task->text_parts); it != NULL; it = g_list_next (it)) {
+		part = it->data;
+		/* The first unbalanced HTML part decides the result */
+		if (part->is_html && !part->is_balanced) {
+			return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+/* State handed to search_html_node_callback while a node tree is traversed */
+struct html_callback_data {
+ struct html_tag *tag; /* tag definition that is being searched for */
+ gboolean *res; /* set to TRUE by the callback once the tag is found */
+};
+
+/*
+ * GNode traversal callback: compares the tag of the visited node with the
+ * tag stored in the html_callback_data passed as `data`. On a match the
+ * result flag is raised and TRUE is returned, which stops the traversal;
+ * otherwise FALSE is returned so the traversal continues. Nodes without
+ * data (the root node) never match.
+ */
+static gboolean
+search_html_node_callback (GNode *node, gpointer data)
+{
+	struct html_callback_data *ctx = data;
+	struct html_node *visited = node->data;
+
+	if (visited == NULL || visited->tag != ctx->tag) {
+		return FALSE;
+	}
+
+	*ctx->res = TRUE;
+	return TRUE;
+}
+
+/*
+ * Expression function: TRUE if the tag named by the first argument occurs in
+ * the node tree of any HTML text part of the task. Returns FALSE (with a
+ * warning) when no argument is given or the tag name is unknown.
+ */
+gboolean
+rspamd_has_html_tag (struct worker_task *task, GList *args)
+{
+	struct expression_argument *arg;
+	struct html_callback_data cd;
+	struct mime_text_part *part;
+	struct html_tag *wanted;
+	GList *it;
+	gboolean found = FALSE;
+
+	if (args == NULL) {
+		msg_warn ("rspamd_has_html_tag: no parameters to function");
+		return FALSE;
+	}
+
+	arg = get_function_arg (args->data, task, TRUE);
+	wanted = get_tag_by_name (arg->data);
+	if (wanted == NULL) {
+		msg_warn ("rspamd_has_html_tag: unknown tag type passed as argument: %s", (char *)arg->data);
+		return FALSE;
+	}
+
+	cd.tag = wanted;
+	cd.res = &found;
+
+	/* Stop walking the parts as soon as one of them contains the tag */
+	for (it = g_list_first (task->text_parts); it != NULL && !found; it = g_list_next (it)) {
+		part = it->data;
+		if (part->is_html && part->html_nodes) {
+			g_node_traverse (part->html_nodes, G_PRE_ORDER, G_TRAVERSE_ALL, -1, search_html_node_callback, &cd);
+		}
+	}
+
+	return found;
+}
+
/*
* vi:ts=4
*/
diff --git a/src/html.c b/src/html.c
new file mode 100644
index 000000000..9a816c4bd
--- /dev/null
+++ b/src/html.c
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2009, Rambler media
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "util.h"
+#include "main.h"
+#include "message.h"
+#include "html.h"
+
+/* Becomes 1 once tag_defs has been sorted by tag name */
+sig_atomic_t tags_sorted = 0;
+
+/*
+ * Table of known HTML tags: identifier, name and CM_* content model flags.
+ * The initializer order is not a strict name order ("nextid" and the
+ * proprietary elements are listed after "xmp"), so add_html_node qsort()s
+ * the table with tag_cmp on its first call; bsearch() lookups need that.
+ */
+static struct html_tag tag_defs[] =
+{
+ /* W3C defined elements */
+ { Tag_A, "a", (CM_INLINE)},
+ { Tag_ABBR, "abbr", (CM_INLINE)},
+ { Tag_ACRONYM, "acronym", (CM_INLINE)},
+ { Tag_ADDRESS, "address", (CM_BLOCK)},
+ { Tag_APPLET, "applet", (CM_OBJECT|CM_IMG|CM_INLINE|CM_PARAM)},
+ { Tag_AREA, "area", (CM_BLOCK|CM_EMPTY)},
+ { Tag_B, "b", (CM_INLINE)},
+ { Tag_BASE, "base", (CM_HEAD|CM_EMPTY)},
+ { Tag_BASEFONT, "basefont", (CM_INLINE|CM_EMPTY)},
+ { Tag_BDO, "bdo", (CM_INLINE)},
+ { Tag_BIG, "big", (CM_INLINE)},
+ { Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)},
+ { Tag_BODY, "body", (CM_HTML|CM_OPT|CM_OMITST)},
+ { Tag_BR, "br", (CM_INLINE|CM_EMPTY)},
+ { Tag_BUTTON, "button", (CM_INLINE)},
+ { Tag_CAPTION, "caption", (CM_TABLE)},
+ { Tag_CENTER, "center", (CM_BLOCK)},
+ { Tag_CITE, "cite", (CM_INLINE)},
+ { Tag_CODE, "code", (CM_INLINE)},
+ { Tag_COL, "col", (CM_TABLE|CM_EMPTY)},
+ { Tag_COLGROUP, "colgroup", (CM_TABLE|CM_OPT)},
+ { Tag_DD, "dd", (CM_DEFLIST|CM_OPT|CM_NO_INDENT)},
+ { Tag_DEL, "del", (CM_INLINE|CM_BLOCK|CM_MIXED)},
+ { Tag_DFN, "dfn", (CM_INLINE)},
+ { Tag_DIR, "dir", (CM_BLOCK|CM_OBSOLETE)},
+ { Tag_DIV, "div", (CM_BLOCK)},
+ { Tag_DL, "dl", (CM_BLOCK)},
+ { Tag_DT, "dt", (CM_DEFLIST|CM_OPT|CM_NO_INDENT)},
+ { Tag_EM, "em", (CM_INLINE)},
+ { Tag_FIELDSET, "fieldset", (CM_BLOCK)},
+ { Tag_FONT, "font", (CM_INLINE)},
+ { Tag_FORM, "form", (CM_BLOCK)},
+ { Tag_FRAME, "frame", (CM_FRAMES|CM_EMPTY)},
+ { Tag_FRAMESET, "frameset", (CM_HTML|CM_FRAMES)},
+ { Tag_H1, "h1", (CM_BLOCK|CM_HEADING)},
+ { Tag_H2, "h2", (CM_BLOCK|CM_HEADING)},
+ { Tag_H3, "h3", (CM_BLOCK|CM_HEADING)},
+ { Tag_H4, "h4", (CM_BLOCK|CM_HEADING)},
+ { Tag_H5, "h5", (CM_BLOCK|CM_HEADING)},
+ { Tag_H6, "h6", (CM_BLOCK|CM_HEADING)},
+ { Tag_HEAD, "head", (CM_HTML|CM_OPT|CM_OMITST)},
+ { Tag_HR, "hr", (CM_BLOCK|CM_EMPTY)},
+ { Tag_HTML, "html", (CM_HTML|CM_OPT|CM_OMITST)},
+ { Tag_I, "i", (CM_INLINE)},
+ { Tag_IFRAME, "iframe", (CM_INLINE)},
+ { Tag_IMG, "img", (CM_INLINE|CM_IMG|CM_EMPTY)},
+ { Tag_INPUT, "input", (CM_INLINE|CM_IMG|CM_EMPTY)},
+ { Tag_INS, "ins", (CM_INLINE|CM_BLOCK|CM_MIXED)},
+ { Tag_ISINDEX, "isindex", (CM_BLOCK|CM_EMPTY)},
+ { Tag_KBD, "kbd", (CM_INLINE)},
+ { Tag_LABEL, "label", (CM_INLINE)},
+ { Tag_LEGEND, "legend", (CM_INLINE)},
+ { Tag_LI, "li", (CM_LIST|CM_OPT|CM_NO_INDENT)},
+ { Tag_LINK, "link", (CM_HEAD|CM_EMPTY)},
+ { Tag_LISTING, "listing", (CM_BLOCK|CM_OBSOLETE)},
+ { Tag_MAP, "map", (CM_INLINE)},
+ { Tag_MENU, "menu", (CM_BLOCK|CM_OBSOLETE)},
+ { Tag_META, "meta", (CM_HEAD|CM_EMPTY)},
+ { Tag_NOFRAMES, "noframes", (CM_BLOCK|CM_FRAMES)},
+ { Tag_NOSCRIPT, "noscript", (CM_BLOCK|CM_INLINE|CM_MIXED)},
+ { Tag_OBJECT, "object", (CM_OBJECT|CM_HEAD|CM_IMG|CM_INLINE|CM_PARAM)},
+ { Tag_OL, "ol", (CM_BLOCK)},
+ { Tag_OPTGROUP, "optgroup", (CM_FIELD|CM_OPT)},
+ { Tag_OPTION, "option", (CM_FIELD|CM_OPT)},
+ { Tag_P, "p", (CM_BLOCK|CM_OPT)},
+ { Tag_PARAM, "param", (CM_INLINE|CM_EMPTY)},
+ { Tag_PLAINTEXT, "plaintext", (CM_BLOCK|CM_OBSOLETE)},
+ { Tag_PRE, "pre", (CM_BLOCK)},
+ { Tag_Q, "q", (CM_INLINE)},
+ { Tag_RB, "rb", (CM_INLINE)},
+ { Tag_RBC, "rbc", (CM_INLINE)},
+ { Tag_RP, "rp", (CM_INLINE)},
+ { Tag_RT, "rt", (CM_INLINE)},
+ { Tag_RTC, "rtc", (CM_INLINE)},
+ { Tag_RUBY, "ruby", (CM_INLINE)},
+ { Tag_S, "s", (CM_INLINE)},
+ { Tag_SAMP, "samp", (CM_INLINE)},
+ { Tag_SCRIPT, "script", (CM_HEAD|CM_MIXED|CM_BLOCK|CM_INLINE)},
+ { Tag_SELECT, "select", (CM_INLINE|CM_FIELD)},
+ { Tag_SMALL, "small", (CM_INLINE)},
+ { Tag_SPAN, "span", (CM_INLINE)},
+ { Tag_STRIKE, "strike", (CM_INLINE)},
+ { Tag_STRONG, "strong", (CM_INLINE)},
+ { Tag_STYLE, "style", (CM_HEAD)},
+ { Tag_SUB, "sub", (CM_INLINE)},
+ { Tag_SUP, "sup", (CM_INLINE)},
+ { Tag_TABLE, "table", (CM_BLOCK)},
+ { Tag_TBODY, "tbody", (CM_TABLE|CM_ROWGRP|CM_OPT)},
+ { Tag_TD, "td", (CM_ROW|CM_OPT|CM_NO_INDENT)},
+ { Tag_TEXTAREA, "textarea", (CM_INLINE|CM_FIELD)},
+ { Tag_TFOOT, "tfoot", (CM_TABLE|CM_ROWGRP|CM_OPT)},
+ { Tag_TH, "th", (CM_ROW|CM_OPT|CM_NO_INDENT)},
+ { Tag_THEAD, "thead", (CM_TABLE|CM_ROWGRP|CM_OPT)},
+ { Tag_TITLE, "title", (CM_HEAD)},
+ { Tag_TR, "tr", (CM_TABLE|CM_OPT)},
+ { Tag_TT, "tt", (CM_INLINE)},
+ { Tag_U, "u", (CM_INLINE)},
+ { Tag_UL, "ul", (CM_BLOCK)},
+ { Tag_VAR, "var", (CM_INLINE)},
+ { Tag_XMP, "xmp", (CM_BLOCK|CM_OBSOLETE)},
+ { Tag_NEXTID, "nextid", (CM_HEAD|CM_EMPTY)},
+
+ /* proprietary elements */
+ { Tag_ALIGN, "align", (CM_BLOCK)},
+ { Tag_BGSOUND, "bgsound", (CM_HEAD|CM_EMPTY)},
+ { Tag_BLINK, "blink", (CM_INLINE)},
+ { Tag_COMMENT, "comment", (CM_INLINE)},
+ { Tag_EMBED, "embed", (CM_INLINE|CM_IMG|CM_EMPTY)},
+ { Tag_ILAYER, "ilayer", (CM_INLINE)},
+ { Tag_KEYGEN, "keygen", (CM_INLINE|CM_EMPTY)},
+ { Tag_LAYER, "layer", (CM_BLOCK)},
+ { Tag_MARQUEE, "marquee", (CM_INLINE|CM_OPT)},
+ { Tag_MULTICOL, "multicol", (CM_BLOCK)},
+ { Tag_NOBR, "nobr", (CM_INLINE)},
+ { Tag_NOEMBED, "noembed", (CM_INLINE)},
+ { Tag_NOLAYER, "nolayer", (CM_BLOCK|CM_INLINE|CM_MIXED)},
+ { Tag_NOSAVE, "nosave", (CM_BLOCK)},
+ { Tag_SERVER, "server", (CM_HEAD|CM_MIXED|CM_BLOCK|CM_INLINE)},
+ { Tag_SERVLET, "servlet", (CM_OBJECT|CM_IMG|CM_INLINE|CM_PARAM)},
+ { Tag_SPACER, "spacer", (CM_INLINE|CM_EMPTY)},
+ { Tag_WBR, "wbr", (CM_INLINE|CM_EMPTY)},
+};
+
+/*
+ * qsort()/bsearch() comparator for struct html_tag entries: orders them by
+ * tag name, ignoring ASCII case. Only the name field is read.
+ */
+static int
+tag_cmp (const void *m1, const void *m2)
+{
+	const char *lhs = ((const struct html_tag *)m1)->name;
+	const char *rhs = ((const struct html_tag *)m2)->name;
+
+	return g_ascii_strcasecmp (lhs, rhs);
+}
+
+/*
+ * Build a tree node for a single tag.
+ *
+ * text is the NUL-terminated tag content, i.e. what stands between '<' and
+ * '>'. The buffer is modified temporarily while the tag name is looked up
+ * and is restored before the function returns.
+ *
+ * Returns a new GNode whose data is a pool-allocated struct html_node, or
+ * NULL when text is NULL/empty or the tag name is not present in tag_defs.
+ * An XML declaration ("?xml ...") yields a node with tag == NULL and the
+ * FL_XML flag set.
+ */
+static GNode*
+construct_html_node (memory_pool_t *pool, char *text)
+{
+	struct html_node *html;
+	struct html_tag key, *found;
+	char *name_end, saved;
+	size_t taglen;
+
+	/* Validate first: calling strlen () on a NULL pointer is undefined */
+	if (text == NULL || *text == '\0') {
+		return NULL;
+	}
+	taglen = strlen (text);
+
+	html = memory_pool_alloc0 (pool, sizeof (struct html_node));
+
+	/* Check whether this tag is fully closed, e.g. <br /> */
+	if (text[taglen - 1] == '/') {
+		html->flags |= FL_CLOSED;
+	}
+
+	/* Check xml tag */
+	if (*text == '?' && g_ascii_strncasecmp (text + 1, "xml", sizeof ("xml") - 1) == 0) {
+		/*
+		 * An XML declaration never has a closing tag, so mark it closed:
+		 * otherwise it would become a nesting level that nothing can close
+		 * and every part containing it would look unbalanced.
+		 */
+		html->flags |= FL_XML | FL_CLOSED;
+		html->tag = NULL;
+		return g_node_new (html);
+	}
+
+	if (*text == '/') {
+		html->flags |= FL_CLOSING;
+		text ++;
+	}
+
+	/*
+	 * Find end of tag name: the first character is always accepted, after
+	 * that the name extends over alphanumeric characters only.
+	 */
+	key.name = text;
+	name_end = text;
+	if (*name_end != '\0') {
+		do {
+			name_end ++;
+		} while (g_ascii_isalnum (*name_end));
+	}
+
+	/* Cut the name out of the buffer for the lookup, then restore the buffer */
+	saved = *name_end;
+	*name_end = '\0';
+	found = bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
+	*name_end = saved;
+
+	if (found == NULL) {
+		/* Unknown tag */
+		return NULL;
+	}
+	html->tag = found;
+
+	return g_node_new (html);
+}
+
+/*
+ * Check a freshly appended node against the currently open tags.
+ *
+ * Non-closing nodes are always accepted. For a closing tag the ancestors of
+ * node are walked upwards looking for a not yet closed tag with the same id.
+ * When one is found it is marked FL_CLOSED, node itself is destroyed and
+ * *cur_level is moved to the parent of the matched ancestor.
+ *
+ * Returns TRUE if the node is acceptable, FALSE if it is a closing tag
+ * without a matching open ancestor (node is left in the tree in this case).
+ */
+static gboolean
+check_balance (GNode *node, GNode **cur_level)
+{
+	struct html_node *arg = node->data, *tmp;
+	GNode *cur;
+
+	if ((arg->flags & FL_CLOSING) == 0) {
+		return TRUE;
+	}
+
+	/* The walk stops at the root node, which carries no data */
+	for (cur = node->parent; cur && cur->data; cur = cur->parent) {
+		tmp = cur->data;
+		if (tmp->tag == NULL) {
+			/* XML declaration nodes have no tag definition and never match */
+			continue;
+		}
+		if (tmp->tag->id == arg->tag->id && (tmp->flags & FL_CLOSED) == 0) {
+			msg_debug ("check_balance: found closing tag for parent '%s'", tmp->tag->name);
+			tmp->flags |= FL_CLOSED;
+			/* Destroy current node as we find corresponding parent node */
+			g_node_destroy (node);
+			/* Change level */
+			*cur_level = cur->parent;
+			return TRUE;
+		}
+	}
+
+	msg_debug ("check_balance: found unbalanced tag %s", arg->tag->name);
+	return FALSE;
+}
+
+/*
+ * Look up a tag definition by its name (ASCII case is ignored).
+ *
+ * Returns a pointer to the matching entry of tag_defs, or NULL if name is
+ * NULL or no such tag is known.
+ */
+struct html_tag *
+get_tag_by_name (const char *name)
+{
+	struct html_tag key;
+
+	if (name == NULL) {
+		return NULL;
+	}
+
+	/*
+	 * bsearch () needs sorted input and tag_defs is not sorted as written.
+	 * This function may be called before any HTML part has been parsed, so
+	 * make sure the table is sorted here as well.
+	 */
+	if (!tags_sorted) {
+		qsort (tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
+		tags_sorted = 1;
+	}
+
+	key.name = name;
+
+	return bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
+}
+
+/*
+ * Parse one tag and add it to the node tree of a text part.
+ *
+ * pool      - memory pool used for node data and for the tree destructor
+ * part      - text part that owns the tree (html_nodes) and the is_balanced flag
+ * tag_text  - NUL-terminated tag content (text between '<' and '>')
+ * cur_level - in/out: node that new tags are appended to; it is moved down
+ *             when a tag opens and up when a closing tag matches
+ *
+ * The tree root is created on the first call for a part. A closing tag
+ * without a matching open tag marks the part as unbalanced.
+ *
+ * Returns TRUE if the tag was processed, FALSE if the tag is not known or
+ * there is no current level to attach it to.
+ */
+gboolean
+add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level)
+{
+	GNode *new;
+	struct html_node *data;
+	const char *tag_name;
+
+	if (!tags_sorted) {
+		qsort (tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
+		tags_sorted = 1;
+	}
+
+	/* First call of this function for the part: insert root node */
+	if (part->html_nodes == NULL) {
+		new = g_node_new (NULL);
+		*cur_level = new;
+		part->html_nodes = new;
+		memory_pool_add_destructor (pool, (pool_destruct_func)g_node_destroy, part->html_nodes);
+	}
+
+	new = construct_html_node (pool, tag_text);
+	if (new == NULL) {
+		msg_debug ("add_html_node: cannot construct HTML node for text '%s'", tag_text);
+		/* The function is gboolean: -1 would read as TRUE */
+		return FALSE;
+	}
+
+	if (! *cur_level) {
+		msg_debug ("add_html_node: bad parent node");
+		/* The node was never linked into the tree, so free it here */
+		g_node_destroy (new);
+		return FALSE;
+	}
+
+	data = new->data;
+	/* XML declaration nodes carry no tag definition */
+	tag_name = data->tag != NULL ? data->tag->name : "?xml";
+
+	g_node_append (*cur_level, new);
+
+	if (data->flags & FL_CLOSING) {
+		if (!check_balance (new, cur_level)) {
+			msg_debug ("add_html_node: mark part as unbalanced as it has not pairable closing tags");
+			part->is_balanced = FALSE;
+		}
+	}
+	else if ((data->flags & (FL_CLOSED | FL_XML)) == 0) {
+		msg_debug ("add_html_node: append opening tag: '%s'", tag_name);
+		/* Following tags are nested inside this one until it is closed */
+		*cur_level = new;
+	}
+	else {
+		msg_debug ("add_html_node: append closed tag: '%s'", tag_name);
+	}
+
+	return TRUE;
+}
+
+/*
+ * vi:ts=4
+ */
diff --git a/src/html.h b/src/html.h
new file mode 100644
index 000000000..70f20de49
--- /dev/null
+++ b/src/html.h
@@ -0,0 +1,210 @@
+/*
+ * Functions for simple html parsing
+ */
+
+#ifndef RSPAMD_HTML_H
+#define RSPAMD_HTML_H
+
+#include "config.h"
+#include "mem_pool.h"
+
+/* Known HTML tags */
+typedef enum
+{
+ Tag_UNKNOWN, /**< Unknown tag! */
+ Tag_A, /**< A */
+ Tag_ABBR, /**< ABBR */
+ Tag_ACRONYM, /**< ACRONYM */
+ Tag_ADDRESS, /**< ADDRESS */
+ Tag_ALIGN, /**< ALIGN */
+ Tag_APPLET, /**< APPLET */
+ Tag_AREA, /**< AREA */
+ Tag_B, /**< B */
+ Tag_BASE, /**< BASE */
+ Tag_BASEFONT, /**< BASEFONT */
+ Tag_BDO, /**< BDO */
+ Tag_BGSOUND, /**< BGSOUND */
+ Tag_BIG, /**< BIG */
+ Tag_BLINK, /**< BLINK */
+ Tag_BLOCKQUOTE, /**< BLOCKQUOTE */
+ Tag_BODY, /**< BODY */
+ Tag_BR, /**< BR */
+ Tag_BUTTON, /**< BUTTON */
+ Tag_CAPTION, /**< CAPTION */
+ Tag_CENTER, /**< CENTER */
+ Tag_CITE, /**< CITE */
+ Tag_CODE, /**< CODE */
+ Tag_COL, /**< COL */
+ Tag_COLGROUP, /**< COLGROUP */
+ Tag_COMMENT, /**< COMMENT */
+ Tag_DD, /**< DD */
+ Tag_DEL, /**< DEL */
+ Tag_DFN, /**< DFN */
+ Tag_DIR, /**< DIR */
+ Tag_DIV, /**< DIV */
+ Tag_DL, /**< DL */
+ Tag_DT, /**< DT */
+ Tag_EM, /**< EM */
+ Tag_EMBED, /**< EMBED */
+ Tag_FIELDSET, /**< FIELDSET */
+ Tag_FONT, /**< FONT */
+ Tag_FORM, /**< FORM */
+ Tag_FRAME, /**< FRAME */
+ Tag_FRAMESET, /**< FRAMESET */
+ Tag_H1, /**< H1 */
+ Tag_H2, /**< H2 */
+ Tag_H3, /**< H3 */
+ Tag_H4, /**< H4 */
+ Tag_H5, /**< H5 */
+ Tag_H6, /**< H6 */
+ Tag_HEAD, /**< HEAD */
+ Tag_HR, /**< HR */
+ Tag_HTML, /**< HTML */
+ Tag_I, /**< I */
+ Tag_IFRAME, /**< IFRAME */
+ Tag_ILAYER, /**< ILAYER */
+ Tag_IMG, /**< IMG */
+ Tag_INPUT, /**< INPUT */
+ Tag_INS, /**< INS */
+ Tag_ISINDEX, /**< ISINDEX */
+ Tag_KBD, /**< KBD */
+ Tag_KEYGEN, /**< KEYGEN */
+ Tag_LABEL, /**< LABEL */
+ Tag_LAYER, /**< LAYER */
+ Tag_LEGEND, /**< LEGEND */
+ Tag_LI, /**< LI */
+ Tag_LINK, /**< LINK */
+ Tag_LISTING, /**< LISTING */
+ Tag_MAP, /**< MAP */
+ Tag_MARQUEE, /**< MARQUEE */
+ Tag_MENU, /**< MENU */
+ Tag_META, /**< META */
+ Tag_MULTICOL, /**< MULTICOL */
+ Tag_NOBR, /**< NOBR */
+ Tag_NOEMBED, /**< NOEMBED */
+ Tag_NOFRAMES, /**< NOFRAMES */
+ Tag_NOLAYER, /**< NOLAYER */
+ Tag_NOSAVE, /**< NOSAVE */
+ Tag_NOSCRIPT, /**< NOSCRIPT */
+ Tag_OBJECT, /**< OBJECT */
+ Tag_OL, /**< OL */
+ Tag_OPTGROUP, /**< OPTGROUP */
+ Tag_OPTION, /**< OPTION */
+ Tag_P, /**< P */
+ Tag_PARAM, /**< PARAM */
+ Tag_PLAINTEXT,/**< PLAINTEXT */
+ Tag_PRE, /**< PRE */
+ Tag_Q, /**< Q */
+ Tag_RB, /**< RB */
+ Tag_RBC, /**< RBC */
+ Tag_RP, /**< RP */
+ Tag_RT, /**< RT */
+ Tag_RTC, /**< RTC */
+ Tag_RUBY, /**< RUBY */
+ Tag_S, /**< S */
+ Tag_SAMP, /**< SAMP */
+ Tag_SCRIPT, /**< SCRIPT */
+ Tag_SELECT, /**< SELECT */
+ Tag_SERVER, /**< SERVER */
+ Tag_SERVLET, /**< SERVLET */
+ Tag_SMALL, /**< SMALL */
+ Tag_SPACER, /**< SPACER */
+ Tag_SPAN, /**< SPAN */
+ Tag_STRIKE, /**< STRIKE */
+ Tag_STRONG, /**< STRONG */
+ Tag_STYLE, /**< STYLE */
+ Tag_SUB, /**< SUB */
+ Tag_SUP, /**< SUP */
+ Tag_TABLE, /**< TABLE */
+ Tag_TBODY, /**< TBODY */
+ Tag_TD, /**< TD */
+ Tag_TEXTAREA, /**< TEXTAREA */
+ Tag_TFOOT, /**< TFOOT */
+ Tag_TH, /**< TH */
+ Tag_THEAD, /**< THEAD */
+ Tag_TITLE, /**< TITLE */
+ Tag_TR, /**< TR */
+ Tag_TT, /**< TT */
+ Tag_U, /**< U */
+ Tag_UL, /**< UL */
+ Tag_VAR, /**< VAR */
+ Tag_WBR, /**< WBR */
+ Tag_XMP, /**< XMP */
+ Tag_XML, /**< XML */
+ Tag_NEXTID, /**< NEXTID */
+
+ N_TAGS /**< Must be last */
+} tag_id_t;
+
+#define CM_UNKNOWN 0
+/* Elements with no content. Map to HTML specification. */
+#define CM_EMPTY (1 << 0)
+/* Elements that appear outside of "BODY". */
+#define CM_HTML (1 << 1)
+/* Elements that can appear within HEAD. */
+#define CM_HEAD (1 << 2)
+/* HTML "block" elements. */
+#define CM_BLOCK (1 << 3)
+/* HTML "inline" elements. */
+#define CM_INLINE (1 << 4)
+/* Elements that mark list item ("LI"). */
+#define CM_LIST (1 << 5)
+/* Elements that mark definition list item ("DD", "DT"). */
+#define CM_DEFLIST (1 << 6)
+/* Elements that can appear inside TABLE. */
+#define CM_TABLE (1 << 7)
+/* Used for "THEAD", "TFOOT" or "TBODY". */
+#define CM_ROWGRP (1 << 8)
+/* Used for "TD", "TH" */
+#define CM_ROW (1 << 9)
+/* Elements whose content must be protected against white space movement.
+ Includes some elements that can be found in forms. */
+#define CM_FIELD (1 << 10)
+/* Used to avoid propagating inline emphasis inside some elements
+ such as OBJECT or APPLET. */
+#define CM_OBJECT (1 << 11)
+/* Elements that allow "PARAM". */
+#define CM_PARAM (1 << 12)
+/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
+#define CM_FRAMES (1 << 13)
+/* Heading elements (h1, h2, ...). */
+#define CM_HEADING (1 << 14)
+/* Elements with an optional end tag. */
+#define CM_OPT (1 << 15)
+/* Elements that use "align" attribute for vertical position. */
+#define CM_IMG (1 << 16)
+/* Elements with inline and block model. Used to avoid calling InlineDup. */
+#define CM_MIXED (1 << 17)
+/* Elements whose content needs to be indented only if containing one
+ CM_BLOCK element. */
+#define CM_NO_INDENT (1 << 18)
+/* Elements that are obsolete (such as "dir", "menu"). */
+#define CM_OBSOLETE (1 << 19)
+/* User defined elements. Used to determine how attributes without value
+ should be printed. */
+#define CM_NEW (1 << 20)
+/* Elements that cannot be omitted. */
+#define CM_OMITST (1 << 21)
+
+/* XML tag */
+#define FL_XML (1 << 0)
+/* Closing tag */
+#define FL_CLOSING (1 << 1)
+/* Fully closed tag (e.g. <a attrs />) */
+#define FL_CLOSED (1 << 2)
+
+/* Static description of a known HTML tag */
+struct html_tag {
+ tag_id_t id; /* Tag_* identifier */
+ const char *name; /* tag name, the key used for sorting and lookup */
+ int flags; /* combination of CM_* content model flags */
+};
+
+/* One parsed tag occurrence, stored as the data of a GNode in the part's tree */
+struct html_node {
+ struct html_tag *tag; /* definition of the tag; NULL for XML declarations */
+ int flags; /* combination of FL_* flags */
+};
+
+/*
+ * Parse the tag text (content between '<' and '>') and add the tag to the
+ * node tree of the text part, creating the tree root on the first call.
+ * cur_level points to the node that new tags are appended to and is updated
+ * as tags open and close. A closing tag without a matching open tag marks
+ * the part as unbalanced.
+ */
+gboolean add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level);
+/*
+ * Find the definition of a known tag by its name (ASCII case is ignored).
+ * Returns NULL when no definition is found.
+ */
+struct html_tag * get_tag_by_name (const char *name);
+
+#endif
diff --git a/src/message.c b/src/message.c
index 5d344db62..3024377d5 100644
--- a/src/message.c
+++ b/src/message.c
@@ -27,15 +27,17 @@
#include "main.h"
#include "message.h"
#include "cfg_file.h"
+#include "html.h"
#include "modules.h"
GByteArray*
-strip_html_tags (GByteArray *src, int *stateptr)
+strip_html_tags (memory_pool_t *pool, struct mime_text_part *part, GByteArray *src, int *stateptr)
{
- uint8_t *tbuf = NULL, *p, *tp = NULL, *rp, c, lc;
+ uint8_t *tbuf = NULL, *p, *tp = NULL, *rp, *tbegin, c, lc;
int br, i = 0, depth = 0, in_q = 0;
int state = 0;
GByteArray *buf;
+ GNode *level_ptr = NULL;
if (stateptr)
state = *stateptr;
@@ -59,6 +61,7 @@ strip_html_tags (GByteArray *src, int *stateptr)
}
if (state == 0) {
lc = '<';
+ tbegin = p + 1;
state = 1;
} else if (state == 1) {
depth++;
@@ -101,7 +104,9 @@ strip_html_tags (GByteArray *src, int *stateptr)
case 1: /* HTML/XML */
lc = '>';
in_q = state = 0;
-
+ *p = '\0';
+ add_html_node (pool, part, tbegin, &level_ptr);
+ *p = '>';
break;
case 2: /* PHP */
@@ -220,9 +225,15 @@ reg_char:
*rp = '\0';
g_byte_array_set_size (buf, rp - buf->data);
}
+
+ /* Check tag balancing */
+ if (level_ptr && level_ptr->data != NULL) {
+ part->is_balanced = FALSE;
+ }
- if (stateptr)
+ if (stateptr) {
*stateptr = state;
+ }
return buf;
}
@@ -287,8 +298,10 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
- text_part->content = strip_html_tags (part_content, NULL);
text_part->is_html = TRUE;
+ text_part->is_balanced = TRUE;
+ text_part->html_nodes = NULL;
+ text_part->content = strip_html_tags (task->task_pool, text_part, part_content, NULL);
text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, text_part->content);
task->text_parts = g_list_prepend (task->text_parts, text_part);
@@ -591,10 +604,12 @@ process_learn (struct controller_session *session)
return 0;
}
+/*
+ * XXX: remove this function for learning
+ */
GByteArray*
get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur)
{
- GByteArray *ret = NULL;
struct mime_part *p;
if (*cur == NULL) {
@@ -611,6 +626,7 @@ get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur)
msg_debug ("get_next_text_part: text/plain part");
return p->content;
}
+#if 0
else if (g_mime_content_type_is_type (p->type, "text", "html")) {
msg_debug ("get_next_text_part: try to strip html tags");
ret = strip_html_tags (p->content, NULL);
@@ -623,6 +639,7 @@ get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur)
memory_pool_add_destructor (pool, (pool_destruct_func)free_byte_array_callback, ret);
return ret;
}
+#endif
*cur = g_list_next (*cur);
}
diff --git a/src/message.h b/src/message.h
index 9e9b5de1f..72711638f 100644
--- a/src/message.h
+++ b/src/message.h
@@ -17,8 +17,10 @@ struct mime_part {
struct mime_text_part {
gboolean is_html;
gboolean is_raw;
+ gboolean is_balanced;
GByteArray *orig;
GByteArray *content;
+ GNode *html_nodes;
fuzzy_hash_t *fuzzy;
};