7 files changed, 651 insertions, 7 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e346370d1..f744fff03 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ PROJECT(rspamd C)
 
 SET(RSPAMD_VERSION_MAJOR 0)
 SET(RSPAMD_VERSION_MINOR 1)
-SET(RSPAMD_VERSION_PATCH 1)
+SET(RSPAMD_VERSION_PATCH 2)
 
 SET(RSPAMD_VERSION         "${RSPAMD_VERSION_MAJOR}.${RSPAMD_VERSION_MINOR}.${RSPAMD_VERSION_PATCH}")
 SET(RSPAMD_MASTER_SITE_URL "http://cebka.pp.ru/hg/rspamd")
@@ -304,6 +304,7 @@ SET(RSPAMDSRC	src/modules.c
 				src/controller.c
 				src/cfg_utils.c
 				src/buffer.c
+				src/html.c
 				src/lmtp.c
 				src/lmtp_proto.c)
 
@@ -339,6 +340,7 @@ SET(TESTDEPENDS	src/mem_pool.c
 				src/fuzzy.c
 				src/memcached.c
 				src/message.c
+				src/html.c
 				src/expressions.c
 				src/statfile.c)
 
@@ -351,6 +353,7 @@ SET(UTILSDEPENDS src/mem_pool.c
 				src/fuzzy.c
 				src/expressions.c
 				src/message.c
+				src/html.c
 				src/util.c)
 
 LIST(LENGTH PLUGINSSRC RSPAMD_MODULES_NUM)
diff --git a/README.utf8.txt b/README.utf8.txt
index a52e380f4..b27e2876f 100644
--- a/README.utf8.txt
+++ b/README.utf8.txt
@@ -156,6 +156,8 @@ SOME_SYMBOL = "${to_blah} & !(${from_blah} | ${subject_blah})"
   * has_only_html_part - функция возвращает TRUE, если в сообщении есть только одна HTML часть
   * compare_recipients_distance - вычисляет процент схожих получателей письма. Принимает аргумент - порог в процентах похожести.
   * is_recipients_sorted - возвращает TRUE, если список получателей сортирован (работает только если число получателей >= 5).
+  * is_html_balanced - возвращает TRUE, если теги всех html частей сбалансированы
+  * has_html_tag - возвращает TRUE, если заданный html тег найден
 
 Модуль chartable.
 ================
diff --git a/src/expressions.c b/src/expressions.c
index c7b88adb9..05bc12e88 100644
--- a/src/expressions.c
+++ b/src/expressions.c
@@ -29,6 +29,7 @@
 #include "message.h"
 #include "fuzzy.h"
 #include "expressions.h"
+#include "html.h"
 
 gboolean rspamd_compare_encoding (struct worker_task *task, GList *args);
 gboolean rspamd_header_exists (struct worker_task *task, GList *args);
@@ -43,6 +44,8 @@ gboolean rspamd_has_content_part_len (struct worker_task *task, GList *args);
 gboolean rspamd_has_only_html_part (struct worker_task *task, GList *args);
 gboolean rspamd_is_recipients_sorted (struct worker_task *task, GList *args);
 gboolean rspamd_compare_transfer_encoding (struct worker_task *task, GList *args);
+gboolean rspamd_is_html_balanced (struct worker_task *task, GList *args);
+gboolean rspamd_has_html_tag (struct worker_task *task, GList *args);
 
 /*
  * List of internal functions of rspamd
@@ -62,8 +65,10 @@ static struct _fl {
 	{ "content_type_is_type", rspamd_content_type_is_type },
 	{ "has_content_part", rspamd_has_content_part },
 	{ "has_content_part_len", rspamd_has_content_part_len },
+	{ "has_html_tag", rspamd_has_html_tag },
 	{ "has_only_html_part", rspamd_has_only_html_part },
 	{ "header_exists", rspamd_header_exists },
+	{ "is_html_balanced", rspamd_is_html_balanced },
 	{ "is_recipients_sorted", rspamd_is_recipients_sorted },
 };
 
@@ -1523,6 +1528,92 @@ rspamd_compare_transfer_encoding (struct worker_task *task, GList *args)
 	return FALSE;
 }
 
+/*
+ * Expression function: TRUE when no HTML text part of the task was marked
+ * as unbalanced by the parser (also TRUE if there are no HTML parts at all).
+ * The args list is not used.
+ */
+gboolean
+rspamd_is_html_balanced (struct worker_task *task, GList *args)
+{
+	GList *iter;
+	struct mime_text_part *part;
+
+	for (iter = g_list_first (task->text_parts); iter != NULL; iter = g_list_next (iter)) {
+		part = iter->data;
+		/* Plain text parts carry no tag information, skip them */
+		if (!part->is_html) {
+			continue;
+		}
+		/* The first unbalanced HTML part decides the result */
+		if (!part->is_balanced) {
+			return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+struct html_callback_data { /* state shared with search_html_node_callback */
+	struct html_tag *tag; /* tag definition being searched for */
+	gboolean *res; /* set to TRUE by the callback on the first match */
+};
+
+/* GNode traverse callback: flag a match and stop once the wanted tag is met */
+static gboolean
+search_html_node_callback (GNode *node, gpointer data)
+{
+	struct html_node *html = node->data;
+	struct html_callback_data *ctx = data;
+
+	/* Skip nodes without payload (e.g. the root created with NULL data) */
+	if (html == NULL || html->tag != ctx->tag) {
+		return FALSE;
+	}
+
+	/* Returning TRUE makes g_node_traverse stop walking the tree */
+	*ctx->res = TRUE;
+	return TRUE;
+}
+
+/*
+ * Expression function: TRUE if the tag named by the first argument occurs
+ * in the parsed node tree of at least one HTML text part of the task.
+ */
+gboolean
+rspamd_has_html_tag (struct worker_task *task, GList *args)
+{
+	struct html_callback_data search;
+	struct expression_argument *name_arg;
+	struct mime_text_part *part;
+	gboolean found = FALSE;
+	GList *iter;
+
+	if (args == NULL) {
+		msg_warn ("rspamd_has_html_tag: no parameters to function");
+		return FALSE;
+	}
+
+	/* Resolve the requested name into its tag definition */
+	name_arg = get_function_arg (args->data, task, TRUE);
+	search.res = &found;
+	search.tag = get_tag_by_name (name_arg->data);
+	if (search.tag == NULL) {
+		msg_warn ("rspamd_has_html_tag: unknown tag type passed as argument: %s", (char *)name_arg->data);
+		return FALSE;
+	}
+
+	/* Walk HTML parts one by one until the callback reports a match */
+	for (iter = g_list_first (task->text_parts); iter != NULL && !found; iter = g_list_next (iter)) {
+		part = iter->data;
+		if (part->is_html && part->html_nodes != NULL) {
+			g_node_traverse (part->html_nodes, G_PRE_ORDER, G_TRAVERSE_ALL, -1, search_html_node_callback, &search);
+		}
+	}
+
+	return found;
+}
+
 /*
  * vi:ts=4
  */
diff --git a/src/html.c b/src/html.c
new file mode 100644
index 000000000..9a816c4bd
--- /dev/null
+++ b/src/html.c
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2009, Rambler media
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "util.h"
+#include "main.h"
+#include "message.h"
+#include "html.h"
+
+sig_atomic_t tags_sorted = 0;
+
+static struct html_tag tag_defs[] = /* not const: sorted by name with qsort at run time for bsearch */
+{
+  /* W3C defined elements; each entry is { id, name, CM_* flags } */
+  { Tag_A,          "a",          (CM_INLINE)},
+  { Tag_ABBR,       "abbr",       (CM_INLINE)},
+  { Tag_ACRONYM,    "acronym",    (CM_INLINE)},
+  { Tag_ADDRESS,    "address",    (CM_BLOCK)},
+  { Tag_APPLET,     "applet",     (CM_OBJECT|CM_IMG|CM_INLINE|CM_PARAM)},
+  { Tag_AREA,       "area",       (CM_BLOCK|CM_EMPTY)},
+  { Tag_B,          "b",          (CM_INLINE)},
+  { Tag_BASE,       "base",       (CM_HEAD|CM_EMPTY)},
+  { Tag_BASEFONT,   "basefont",   (CM_INLINE|CM_EMPTY)},
+  { Tag_BDO,        "bdo",        (CM_INLINE)},
+  { Tag_BIG,        "big",        (CM_INLINE)},
+  { Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)},
+  { Tag_BODY,       "body",       (CM_HTML|CM_OPT|CM_OMITST)},
+  { Tag_BR,         "br",         (CM_INLINE|CM_EMPTY)},
+  { Tag_BUTTON,     "button",     (CM_INLINE)},
+  { Tag_CAPTION,    "caption",    (CM_TABLE)},
+  { Tag_CENTER,     "center",     (CM_BLOCK)},
+  { Tag_CITE,       "cite",       (CM_INLINE)},
+  { Tag_CODE,       "code",       (CM_INLINE)},
+  { Tag_COL,        "col",        (CM_TABLE|CM_EMPTY)},
+  { Tag_COLGROUP,   "colgroup",   (CM_TABLE|CM_OPT)},
+  { Tag_DD,         "dd",         (CM_DEFLIST|CM_OPT|CM_NO_INDENT)},
+  { Tag_DEL,        "del",        (CM_INLINE|CM_BLOCK|CM_MIXED)},
+  { Tag_DFN,        "dfn",        (CM_INLINE)},
+  { Tag_DIR,        "dir",        (CM_BLOCK|CM_OBSOLETE)},
+  { Tag_DIV,        "div",        (CM_BLOCK)},
+  { Tag_DL,         "dl",         (CM_BLOCK)},
+  { Tag_DT,         "dt",         (CM_DEFLIST|CM_OPT|CM_NO_INDENT)},
+  { Tag_EM,         "em",         (CM_INLINE)},
+  { Tag_FIELDSET,   "fieldset",   (CM_BLOCK)},
+  { Tag_FONT,       "font",       (CM_INLINE)},
+  { Tag_FORM,       "form",       (CM_BLOCK)},
+  { Tag_FRAME,      "frame",      (CM_FRAMES|CM_EMPTY)},
+  { Tag_FRAMESET,   "frameset",   (CM_HTML|CM_FRAMES)},
+  { Tag_H1,         "h1",         (CM_BLOCK|CM_HEADING)},
+  { Tag_H2,         "h2",         (CM_BLOCK|CM_HEADING)},
+  { Tag_H3,         "h3",         (CM_BLOCK|CM_HEADING)},
+  { Tag_H4,         "h4",         (CM_BLOCK|CM_HEADING)},
+  { Tag_H5,         "h5",         (CM_BLOCK|CM_HEADING)},
+  { Tag_H6,         "h6",         (CM_BLOCK|CM_HEADING)},
+  { Tag_HEAD,       "head",       (CM_HTML|CM_OPT|CM_OMITST)},
+  { Tag_HR,         "hr",         (CM_BLOCK|CM_EMPTY)},
+  { Tag_HTML,       "html",       (CM_HTML|CM_OPT|CM_OMITST)},
+  { Tag_I,          "i",          (CM_INLINE)},
+  { Tag_IFRAME,     "iframe",     (CM_INLINE)},
+  { Tag_IMG,        "img",        (CM_INLINE|CM_IMG|CM_EMPTY)},
+  { Tag_INPUT,      "input",      (CM_INLINE|CM_IMG|CM_EMPTY)},
+  { Tag_INS,        "ins",        (CM_INLINE|CM_BLOCK|CM_MIXED)},
+  { Tag_ISINDEX,    "isindex",    (CM_BLOCK|CM_EMPTY)},
+  { Tag_KBD,        "kbd",        (CM_INLINE)},
+  { Tag_LABEL,      "label",      (CM_INLINE)},
+  { Tag_LEGEND,     "legend",     (CM_INLINE)},
+  { Tag_LI,         "li",         (CM_LIST|CM_OPT|CM_NO_INDENT)},
+  { Tag_LINK,       "link",       (CM_HEAD|CM_EMPTY)},
+  { Tag_LISTING,    "listing",    (CM_BLOCK|CM_OBSOLETE)},
+  { Tag_MAP,        "map",        (CM_INLINE)},
+  { Tag_MENU,       "menu",       (CM_BLOCK|CM_OBSOLETE)},
+  { Tag_META,       "meta",       (CM_HEAD|CM_EMPTY)},
+  { Tag_NOFRAMES,   "noframes",   (CM_BLOCK|CM_FRAMES)},
+  { Tag_NOSCRIPT,   "noscript",   (CM_BLOCK|CM_INLINE|CM_MIXED)},
+  { Tag_OBJECT,     "object",     (CM_OBJECT|CM_HEAD|CM_IMG|CM_INLINE|CM_PARAM)},
+  { Tag_OL,         "ol",         (CM_BLOCK)},
+  { Tag_OPTGROUP,   "optgroup",   (CM_FIELD|CM_OPT)},
+  { Tag_OPTION,     "option",     (CM_FIELD|CM_OPT)},
+  { Tag_P,          "p",          (CM_BLOCK|CM_OPT)},
+  { Tag_PARAM,      "param",      (CM_INLINE|CM_EMPTY)},
+  { Tag_PLAINTEXT,  "plaintext",  (CM_BLOCK|CM_OBSOLETE)},
+  { Tag_PRE,        "pre",        (CM_BLOCK)},
+  { Tag_Q,          "q",          (CM_INLINE)},
+  { Tag_RB,         "rb",         (CM_INLINE)},
+  { Tag_RBC,        "rbc",        (CM_INLINE)},
+  { Tag_RP,         "rp",         (CM_INLINE)},
+  { Tag_RT,         "rt",         (CM_INLINE)},
+  { Tag_RTC,        "rtc",        (CM_INLINE)},
+  { Tag_RUBY,       "ruby",       (CM_INLINE)},
+  { Tag_S,          "s",          (CM_INLINE)},
+  { Tag_SAMP,       "samp",       (CM_INLINE)},
+  { Tag_SCRIPT,     "script",     (CM_HEAD|CM_MIXED|CM_BLOCK|CM_INLINE)},
+  { Tag_SELECT,     "select",     (CM_INLINE|CM_FIELD)},
+  { Tag_SMALL,      "small",      (CM_INLINE)},
+  { Tag_SPAN,       "span",       (CM_INLINE)},
+  { Tag_STRIKE,     "strike",     (CM_INLINE)},
+  { Tag_STRONG,     "strong",     (CM_INLINE)},
+  { Tag_STYLE,      "style",      (CM_HEAD)},
+  { Tag_SUB,        "sub",        (CM_INLINE)},
+  { Tag_SUP,        "sup",        (CM_INLINE)},
+  { Tag_TABLE,      "table",      (CM_BLOCK)},
+  { Tag_TBODY,      "tbody",      (CM_TABLE|CM_ROWGRP|CM_OPT)},
+  { Tag_TD,         "td",         (CM_ROW|CM_OPT|CM_NO_INDENT)},
+  { Tag_TEXTAREA,   "textarea",   (CM_INLINE|CM_FIELD)},
+  { Tag_TFOOT,      "tfoot",      (CM_TABLE|CM_ROWGRP|CM_OPT)},
+  { Tag_TH,         "th",         (CM_ROW|CM_OPT|CM_NO_INDENT)},
+  { Tag_THEAD,      "thead",      (CM_TABLE|CM_ROWGRP|CM_OPT)},
+  { Tag_TITLE,      "title",      (CM_HEAD)},
+  { Tag_TR,         "tr",         (CM_TABLE|CM_OPT)},
+  { Tag_TT,         "tt",         (CM_INLINE)},
+  { Tag_U,          "u",          (CM_INLINE)},
+  { Tag_UL,         "ul",         (CM_BLOCK)},
+  { Tag_VAR,        "var",        (CM_INLINE)},
+  { Tag_XMP,        "xmp",        (CM_BLOCK|CM_OBSOLETE)},
+  { Tag_NEXTID,     "nextid",     (CM_HEAD|CM_EMPTY)},
+
+  /* proprietary elements (listed out of alphabetical order, hence the run-time sort) */
+  { Tag_ALIGN,      "align",      (CM_BLOCK)},
+  { Tag_BGSOUND,    "bgsound",    (CM_HEAD|CM_EMPTY)},
+  { Tag_BLINK,      "blink",      (CM_INLINE)},
+  { Tag_COMMENT,    "comment",    (CM_INLINE)},
+  { Tag_EMBED,      "embed",      (CM_INLINE|CM_IMG|CM_EMPTY)},
+  { Tag_ILAYER,     "ilayer",     (CM_INLINE)},
+  { Tag_KEYGEN,     "keygen",     (CM_INLINE|CM_EMPTY)},
+  { Tag_LAYER,      "layer",      (CM_BLOCK)},
+  { Tag_MARQUEE,    "marquee",    (CM_INLINE|CM_OPT)},
+  { Tag_MULTICOL,   "multicol",   (CM_BLOCK)},
+  { Tag_NOBR,       "nobr",       (CM_INLINE)},
+  { Tag_NOEMBED,    "noembed",    (CM_INLINE)},
+  { Tag_NOLAYER,    "nolayer",    (CM_BLOCK|CM_INLINE|CM_MIXED)},
+  { Tag_NOSAVE,     "nosave",     (CM_BLOCK)},
+  { Tag_SERVER,     "server",     (CM_HEAD|CM_MIXED|CM_BLOCK|CM_INLINE)},
+  { Tag_SERVLET,    "servlet",    (CM_OBJECT|CM_IMG|CM_INLINE|CM_PARAM)},
+  { Tag_SPACER,     "spacer",     (CM_INLINE|CM_EMPTY)},
+  { Tag_WBR,        "wbr",        (CM_INLINE|CM_EMPTY)},
+};
+
+/* qsort/bsearch comparator: order tag definitions by name, ignoring ASCII case */
+static int
+tag_cmp (const void *m1, const void *m2)
+{
+	const char *lhs = ((const struct html_tag *)m1)->name;
+	const char *rhs = ((const struct html_tag *)m2)->name;
+	return g_ascii_strcasecmp (lhs, rhs);
+}
+
+/*
+ * Build a tree node for the tag text found between '<' and '>'.
+ * Returns NULL for empty text or when the tag name is not in tag_defs.
+ */
+static GNode*
+construct_html_node (memory_pool_t *pool, char *text)
+{
+	struct html_node *html;
+	struct html_tag key, *found;
+	char t;
+	int taglen;
+
+	/* Validate the pointer before strlen dereferences it */
+	if (text == NULL || *text == '\0') {
+		return NULL;
+	}
+	taglen = strlen (text);
+
+	html = memory_pool_alloc0 (pool, sizeof (struct html_node));
+
+	/* Check whether this tag is fully closed */
+	if (*(text + taglen - 1) == '/') {
+		html->flags |= FL_CLOSED;
+	}
+
+	/* Check xml tag */
+	if (*text == '?' && g_ascii_strncasecmp (text + 1, "xml", sizeof ("xml") - 1) == 0) {
+		/* XML declarations have no entry in tag_defs, so tag stays NULL */
+		html->flags |= FL_XML;
+		html->tag = NULL;
+	}
+	else {
+		if (*text == '/') {
+			html->flags |= FL_CLOSING;
+			text ++;
+		}
+
+		/* Find end of tag name */
+		key.name = text;
+		while (*text && g_ascii_isalnum (*(++text)));
+		/* Terminate the name in place for bsearch and restore the byte afterwards */
+		t = *text;
+		*text = '\0';
+		found = bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
+		*text = t;
+		if (found == NULL) {
+			return NULL;
+		}
+		html->tag = found;
+	}
+
+	return g_node_new (html);
+}
+
+/*
+ * Pair a closing tag with the nearest open ancestor that has the same tag id.
+ * On success the closing node is destroyed and *cur_level moves to the parent
+ * of the matched ancestor. Returns FALSE when no such ancestor exists.
+ */
+static gboolean
+check_balance (GNode *node, GNode **cur_level)
+{
+	struct html_node *arg = node->data, *tmp;
+	GNode *cur;
+
+	if ((arg->flags & FL_CLOSING) == 0) {
+		return TRUE;
+	}
+	/* Walk up until the root, which is created with NULL data */
+	for (cur = node->parent; cur && cur->data; cur = cur->parent) {
+		tmp = cur->data;
+		/* XML nodes carry no tag definition (tag == NULL), never match them */
+		if (tmp->tag && tmp->tag->id == arg->tag->id && (tmp->flags & FL_CLOSED) == 0) {
+			msg_debug ("check_balance: found closing tag for parent '%s'", tmp->tag->name);
+			tmp->flags |= FL_CLOSED;
+			g_node_destroy (node);
+			*cur_level = cur->parent;
+			return TRUE;
+		}
+	}
+
+	msg_debug ("check_balance: found unbalanced tag %s", arg->tag->name);
+	return FALSE;
+}
+
+/* Sort tag_defs by name once: its initializer is not alphabetical but bsearch needs order */
+static void
+sort_tags_once (void)
+{
+	if (!tags_sorted) {
+		qsort (tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
+		tags_sorted = 1;
+	}
+}
+
+/* Find a tag definition by name (ASCII case is ignored), NULL if the name is unknown */
+struct html_tag *
+get_tag_by_name (const char *name)
+{
+	struct html_tag key;
+	sort_tags_once (); /* a lookup can happen before any tag was parsed */
+	key.name = name;
+	return bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
+}
+
+/* Parse one tag text and link its node into part->html_nodes below *cur_level,
+ * which follows the innermost open tag. Returns FALSE if no node was added. */
+gboolean
+add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level)
+{
+	GNode *new;
+	struct html_node *data;
+
+	sort_tags_once ();
+	/* First call of this function: insert root node, registered as a pool destructor */
+	if (part->html_nodes == NULL) {
+		part->html_nodes = g_node_new (NULL);
+		*cur_level = part->html_nodes;
+		memory_pool_add_destructor (pool, (pool_destruct_func)g_node_destroy, part->html_nodes);
+	}
+	new = construct_html_node (pool, tag_text);
+	if (new == NULL) {
+		msg_debug ("add_html_node: cannot construct HTML node for text '%s'", tag_text);
+		return FALSE; /* was -1, which reads as TRUE in a gboolean */
+	}
+	if (! *cur_level) {
+		msg_debug ("add_html_node: bad parent node");
+		g_node_destroy (new); /* never linked into the tree, so free it here */
+		return FALSE;
+	}
+	data = new->data;
+	g_node_append (*cur_level, new);
+	if (data->flags & FL_CLOSING) {
+		if (!check_balance (new, cur_level)) {
+			msg_debug ("add_html_node: mark part as unbalanced as it has not pairable closing tags");
+			part->is_balanced = FALSE;
+		}
+	}
+	else if (data->tag == NULL || (data->flags & FL_CLOSED)) {
+		/* XML declarations have tag == NULL and no closing pair: keep them as leaves */
+		msg_debug ("add_html_node: append closed tag: '%s'", data->tag ? data->tag->name : "?xml");
+	}
+	else {
+		msg_debug ("add_html_node: append opening tag: '%s'", data->tag->name);
+		*cur_level = new;
+	}
+	return TRUE;
+}
+
+/*
+ * vi:ts=4
+ */
diff --git a/src/html.h b/src/html.h
new file mode 100644
index 000000000..70f20de49
--- /dev/null
+++ b/src/html.h
@@ -0,0 +1,210 @@
+/*
+ * Functions for simple html parsing
+ */
+
+#ifndef RSPAMD_HTML_H
+#define RSPAMD_HTML_H
+
+#include "config.h"
+#include "mem_pool.h"
+
+/* Known HTML tags; these ids fill the id field of the tag_defs entries in html.c */
+typedef enum
+{
+  Tag_UNKNOWN,  /**< Unknown tag! */
+  Tag_A,        /**< A */
+  Tag_ABBR,     /**< ABBR */
+  Tag_ACRONYM,  /**< ACRONYM */
+  Tag_ADDRESS,  /**< ADDRESS */
+  Tag_ALIGN,    /**< ALIGN */
+  Tag_APPLET,   /**< APPLET */
+  Tag_AREA,     /**< AREA */
+  Tag_B,        /**< B */
+  Tag_BASE,     /**< BASE */
+  Tag_BASEFONT, /**< BASEFONT */
+  Tag_BDO,      /**< BDO */
+  Tag_BGSOUND,  /**< BGSOUND */
+  Tag_BIG,      /**< BIG */
+  Tag_BLINK,    /**< BLINK */
+  Tag_BLOCKQUOTE,   /**< BLOCKQUOTE */
+  Tag_BODY,     /**< BODY */
+  Tag_BR,       /**< BR */
+  Tag_BUTTON,   /**< BUTTON */
+  Tag_CAPTION,  /**< CAPTION */
+  Tag_CENTER,   /**< CENTER */
+  Tag_CITE,     /**< CITE */
+  Tag_CODE,     /**< CODE */
+  Tag_COL,      /**< COL */
+  Tag_COLGROUP, /**< COLGROUP */
+  Tag_COMMENT,  /**< COMMENT */
+  Tag_DD,       /**< DD */
+  Tag_DEL,      /**< DEL */
+  Tag_DFN,      /**< DFN */
+  Tag_DIR,      /**< DIR */
+  Tag_DIV,      /**< DIV */
+  Tag_DL,       /**< DL */
+  Tag_DT,       /**< DT */
+  Tag_EM,       /**< EM */
+  Tag_EMBED,    /**< EMBED */
+  Tag_FIELDSET, /**< FIELDSET */
+  Tag_FONT,     /**< FONT */
+  Tag_FORM,     /**< FORM */
+  Tag_FRAME,    /**< FRAME */
+  Tag_FRAMESET, /**< FRAMESET */
+  Tag_H1,       /**< H1 */
+  Tag_H2,       /**< H2 */
+  Tag_H3,       /**< H3 */
+  Tag_H4,       /**< H4 */
+  Tag_H5,       /**< H5 */
+  Tag_H6,       /**< H6 */
+  Tag_HEAD,     /**< HEAD */
+  Tag_HR,       /**< HR */
+  Tag_HTML,     /**< HTML */
+  Tag_I,        /**< I */
+  Tag_IFRAME,   /**< IFRAME */
+  Tag_ILAYER,   /**< ILAYER */
+  Tag_IMG,      /**< IMG */
+  Tag_INPUT,    /**< INPUT */
+  Tag_INS,      /**< INS */
+  Tag_ISINDEX,  /**< ISINDEX */
+  Tag_KBD,      /**< KBD */
+  Tag_KEYGEN,   /**< KEYGEN */
+  Tag_LABEL,    /**< LABEL */
+  Tag_LAYER,    /**< LAYER */
+  Tag_LEGEND,   /**< LEGEND */
+  Tag_LI,       /**< LI */
+  Tag_LINK,     /**< LINK */
+  Tag_LISTING,  /**< LISTING */
+  Tag_MAP,      /**< MAP */
+  Tag_MARQUEE,  /**< MARQUEE */
+  Tag_MENU,     /**< MENU */
+  Tag_META,     /**< META */
+  Tag_MULTICOL, /**< MULTICOL */
+  Tag_NOBR,     /**< NOBR */
+  Tag_NOEMBED,  /**< NOEMBED */
+  Tag_NOFRAMES, /**< NOFRAMES */
+  Tag_NOLAYER,  /**< NOLAYER */
+  Tag_NOSAVE,   /**< NOSAVE */
+  Tag_NOSCRIPT, /**< NOSCRIPT */
+  Tag_OBJECT,   /**< OBJECT */
+  Tag_OL,       /**< OL */
+  Tag_OPTGROUP, /**< OPTGROUP */
+  Tag_OPTION,   /**< OPTION */
+  Tag_P,        /**< P */
+  Tag_PARAM,    /**< PARAM */
+  Tag_PLAINTEXT,/**< PLAINTEXT */
+  Tag_PRE,      /**< PRE */
+  Tag_Q,        /**< Q */
+  Tag_RB,       /**< RB */
+  Tag_RBC,      /**< RBC */
+  Tag_RP,       /**< RP */
+  Tag_RT,       /**< RT */
+  Tag_RTC,      /**< RTC */
+  Tag_RUBY,     /**< RUBY */
+  Tag_S,        /**< S */
+  Tag_SAMP,     /**< SAMP */
+  Tag_SCRIPT,   /**< SCRIPT */
+  Tag_SELECT,   /**< SELECT */
+  Tag_SERVER,   /**< SERVER */
+  Tag_SERVLET,  /**< SERVLET */
+  Tag_SMALL,    /**< SMALL */
+  Tag_SPACER,   /**< SPACER */
+  Tag_SPAN,     /**< SPAN */
+  Tag_STRIKE,   /**< STRIKE */
+  Tag_STRONG,   /**< STRONG */
+  Tag_STYLE,    /**< STYLE */
+  Tag_SUB,      /**< SUB */
+  Tag_SUP,      /**< SUP */
+  Tag_TABLE,    /**< TABLE */
+  Tag_TBODY,    /**< TBODY */
+  Tag_TD,       /**< TD */
+  Tag_TEXTAREA, /**< TEXTAREA */
+  Tag_TFOOT,    /**< TFOOT */
+  Tag_TH,       /**< TH */
+  Tag_THEAD,    /**< THEAD */
+  Tag_TITLE,    /**< TITLE */
+  Tag_TR,       /**< TR */
+  Tag_TT,       /**< TT */
+  Tag_U,        /**< U */
+  Tag_UL,       /**< UL */
+  Tag_VAR,      /**< VAR */
+  Tag_WBR,      /**< WBR */
+  Tag_XMP,      /**< XMP */
+  Tag_XML,		/**< XML */
+  Tag_NEXTID,   /**< NEXTID */
+
+  N_TAGS        /**< Must be last */
+} tag_id_t;
+
+#define CM_UNKNOWN      0
+/* Elements with no content. Map to HTML specification. */
+#define CM_EMPTY        (1 << 0)
+/* Elements that appear outside of "BODY". */
+#define CM_HTML         (1 << 1)
+/* Elements that can appear within HEAD. */
+#define CM_HEAD         (1 << 2)
+/* HTML "block" elements. */
+#define CM_BLOCK        (1 << 3)
+/* HTML "inline" elements. */
+#define CM_INLINE       (1 << 4)
+/* Elements that mark list item ("LI"). */
+#define CM_LIST         (1 << 5)
+/* Elements that mark definition list items ("DD", "DT"). */
+#define CM_DEFLIST      (1 << 6)
+/* Elements that can appear inside TABLE. */
+#define CM_TABLE        (1 << 7)
+/* Used for "THEAD", "TFOOT" or "TBODY". */
+#define CM_ROWGRP       (1 << 8)
+/* Used for "TD", "TH" */
+#define CM_ROW          (1 << 9)
+/* Elements whose content must be protected against white space movement.
+   Includes some elements that can be found in forms. */
+#define CM_FIELD        (1 << 10)
+/* Used to avoid propagating inline emphasis inside some elements
+   such as OBJECT or APPLET. */
+#define CM_OBJECT       (1 << 11)
+/* Elements that allows "PARAM". */
+#define CM_PARAM        (1 << 12)
+/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
+#define CM_FRAMES       (1 << 13)
+/* Heading elements (h1, h2, ...). */
+#define CM_HEADING      (1 << 14)
+/* Elements with an optional end tag. */
+#define CM_OPT          (1 << 15)
+/* Elements that use "align" attribute for vertical position. */
+#define CM_IMG          (1 << 16)
+/* Elements with inline and block model. Used to avoid calling InlineDup. */
+#define CM_MIXED        (1 << 17)
+/* Elements whose content needs to be indented only if containing one 
+   CM_BLOCK element. */
+#define CM_NO_INDENT    (1 << 18)
+/* Elements that are obsolete (such as "dir", "menu"). */
+#define CM_OBSOLETE     (1 << 19)
+/* User defined elements. Used to determine how attributes without value
+   should be printed. */
+#define CM_NEW          (1 << 20)
+/* Elements that cannot be omitted. */
+#define CM_OMITST       (1 << 21)
+
+/* XML tag */
+#define FL_XML			(1 << 0)
+/* Closing tag */
+#define FL_CLOSING		(1 << 1)
+/* Fully closed tag (e.g. <a attrs />) */
+#define FL_CLOSED		(1 << 2)
+
+struct html_tag { /* static description of one known tag, see tag_defs in html.c */
+	tag_id_t id; /* Tag_* identifier */
+	const char *name; /* tag name, compared case-insensitively on lookup */
+	int flags; /* CM_* content model bits */
+};
+
+struct html_node { /* payload stored in GNode->data of the per-part tag tree */
+	struct html_tag *tag; /* matched definition, NULL for an XML declaration */
+	int flags; /* FL_* bits */
+};
+
+gboolean add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level);
+struct html_tag * get_tag_by_name (const char *name);
+
+#endif
diff --git a/src/message.c b/src/message.c
index 5d344db62..3024377d5 100644
--- a/src/message.c
+++ b/src/message.c
@@ -27,15 +27,17 @@
 #include "main.h"
 #include "message.h"
 #include "cfg_file.h"
+#include "html.h"
 #include "modules.h"
 
 GByteArray*
-strip_html_tags (GByteArray *src, int *stateptr)
+strip_html_tags (memory_pool_t *pool, struct mime_text_part *part, GByteArray *src, int *stateptr)
 {
-	uint8_t *tbuf = NULL, *p, *tp = NULL, *rp, c, lc;
+	uint8_t *tbuf = NULL, *p, *tp = NULL, *rp, *tbegin, c, lc;
 	int br, i = 0, depth = 0, in_q = 0;
 	int state = 0;
 	GByteArray *buf;
+	GNode *level_ptr = NULL;
 
 	if (stateptr)
 		state = *stateptr;
@@ -59,6 +61,7 @@ strip_html_tags (GByteArray *src, int *stateptr)
 				}
 				if (state == 0) {
 					lc = '<';
+					tbegin = p + 1;
 					state = 1;
 				} else if (state == 1) {
 					depth++;
@@ -101,7 +104,9 @@ strip_html_tags (GByteArray *src, int *stateptr)
 					case 1: /* HTML/XML */
 						lc = '>';
 						in_q = state = 0;
-						
+						*p = '\0';
+						add_html_node (pool, part, tbegin, &level_ptr);
+						*p = '>';
 						break;
 						
 					case 2: /* PHP */
@@ -220,9 +225,15 @@ reg_char:
 		*rp = '\0';
 		g_byte_array_set_size (buf, rp - buf->data);
 	}
+	
+	/* Check tag balancing */
+	if (level_ptr && level_ptr->data != NULL) {
+			part->is_balanced = FALSE;
+	}
 
-	if (stateptr)
+	if (stateptr) {
 		*stateptr = state;
+	}
 
 	return buf;
 }
@@ -287,8 +298,10 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
 
 		text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
 		text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
-		text_part->content = strip_html_tags (part_content, NULL);
 		text_part->is_html = TRUE;
+		text_part->is_balanced = TRUE;
+		text_part->html_nodes = NULL;
+		text_part->content = strip_html_tags (task->task_pool, text_part, part_content, NULL);
 		text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
 		memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, text_part->content);
 		task->text_parts = g_list_prepend (task->text_parts, text_part);
@@ -591,10 +604,12 @@ process_learn (struct controller_session *session)
 	return 0;
 }
 
+/*
+ * XXX: remove this function for learning
+ */
 GByteArray* 
 get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur)
 {
-	GByteArray *ret = NULL;
 	struct mime_part *p;
 
 	if (*cur == NULL) {
@@ -611,6 +626,7 @@ get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur)
 			msg_debug ("get_next_text_part: text/plain part");
 			return p->content;
 		}
+#if 0
 		else if (g_mime_content_type_is_type (p->type, "text", "html")) {
 			msg_debug ("get_next_text_part: try to strip html tags");
 			ret = strip_html_tags (p->content, NULL);
@@ -623,6 +639,7 @@ get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur)
 			memory_pool_add_destructor (pool, (pool_destruct_func)free_byte_array_callback, ret);
 			return ret;
 		}
+#endif
 		*cur = g_list_next (*cur);
 	}
 	
diff --git a/src/message.h b/src/message.h
index 9e9b5de1f..72711638f 100644
--- a/src/message.h
+++ b/src/message.h
@@ -17,8 +17,10 @@ struct mime_part {
 struct mime_text_part {
 	gboolean is_html;
 	gboolean is_raw;
+	gboolean is_balanced; /* set to FALSE when HTML tags of the part do not pair up */
 	GByteArray *orig;
 	GByteArray *content;
+	GNode *html_nodes; /* root of the parsed tag tree, NULL until the first tag is added */
 	fuzzy_hash_t *fuzzy;
 };