From 500ddb601ca945959149e5a8b5b089151e7b338f Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Fri, 17 Jul 2015 13:51:19 +0100
Subject: [PATCH] Improve tag_exists function.

Replace the GNode traversal behind rspamd_has_html_tag with a per-part
bitset of tag ids (html_content.tags_seen) that is filled while the HTML
part is parsed and queried through rspamd_html_tag_seen.

---
Review changes relative to the original revision of this patch:
 * rspamd_has_html_tag: iterate over parts while `!res`; with `&& res` and
   res initialised to FALSE the loop body was never entered.
 * tags_seen is sized by N_TAGS, the same bound that guards setbit.
 * rspamd_html_tag_seen returns a strict TRUE/FALSE instead of the raw
   isset() bit mask.
 * The `From` hash and the `index` lines still refer to the original
   revision of the patch.

 src/libmime/mime_expressions.c | 45 ++-------------------------------
 src/libserver/html.c           | 28 +++++++++++++++++++--
 src/libserver/html.h           | 12 ++++++----
 3 files changed, 36 insertions(+), 49 deletions(-)

diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c
index c367ad073..bff70c1b7 100644
--- a/src/libmime/mime_expressions.c
+++ b/src/libmime/mime_expressions.c
@@ -1550,37 +1550,13 @@ rspamd_is_html_balanced (struct rspamd_task * task, GArray * args, void *unused)
 
 }
 
-struct html_callback_data {
-	struct html_tag *tag;
-	gboolean *res;
-};
-
-static gboolean
-search_html_node_callback (GNode * node, gpointer data)
-{
-	struct html_callback_data *cd = data;
-	struct html_tag *nd;
-
-	nd = node->data;
-	if (nd) {
-		if (nd->id == cd->tag->id) {
-			*cd->res = TRUE;
-			return TRUE;
-		}
-	}
-
-	return FALSE;
-}
-
 gboolean
 rspamd_has_html_tag (struct rspamd_task * task, GArray * args, void *unused)
 {
 	struct mime_text_part *p;
 	struct expression_argument *arg;
-	struct html_tag *tag;
 	guint i;
 	gboolean res = FALSE;
-	struct html_callback_data cd;
 
 	if (args == NULL) {
 		msg_warn ("no parameters to function");
@@ -1593,27 +1569,12 @@ rspamd_has_html_tag (struct rspamd_task * task, GArray * args, void *unused)
 		return FALSE;
 	}
 
-	tag = get_tag_by_name (arg->data);
-	if (tag == NULL) {
-		msg_warn ("unknown tag type passed as argument: %s",
-			(gchar *)arg->data);
-		return FALSE;
-	}
-
-	cd.res = &res;
-	cd.tag = tag;
-
-	for (i = 0; i < task->text_parts->len && res; i ++) {
+	/* Stop at the first part in which the tag has been seen */
+	for (i = 0; i < task->text_parts->len && !res; i ++) {
 		p = g_ptr_array_index (task->text_parts, i);
 
 		if (!IS_PART_EMPTY (p) && IS_PART_HTML (p) && p->html) {
-			/* TODO: too slow */
-			g_node_traverse (p->html->html_tags,
-				G_PRE_ORDER,
-				G_TRAVERSE_ALL,
-				-1,
-				search_html_node_callback,
-				&cd);
+			res = rspamd_html_tag_seen (p->html, arg->data);
 		}
 	}
 
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 421a89829..cfab7a7d7 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -714,9 +714,26 @@ rspamd_html_check_balance (GNode * node, GNode ** cur_level)
 	return FALSE;
 }
 
-struct html_tag *
-get_tag_by_name (const gchar *name)
+gboolean
+rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname)
 {
+	struct html_tag tag;
+	struct html_tag_def *found;
+
+	g_assert (hc != NULL);
+	g_assert (hc->tags_seen != NULL);
+
+	tag.name.start = tagname;
+	tag.name.len = strlen (tagname);
+
+	found = bsearch (&tag, tag_defs, G_N_ELEMENTS (tag_defs),
+		sizeof (tag_defs[0]), tag_find);
+
+	if (found) {
+		return isset (hc->tags_seen, found->id) ? TRUE : FALSE;
+	}
+
+	return FALSE;
 }
 
 /* Decode HTML entitles in text */
@@ -1291,6 +1308,9 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
 		entities_sorted = 1;
 	}
 
+	/* One bit per tag id: sized by N_TAGS, the bound checked before setbit */
+	hc->tags_seen = rspamd_mempool_alloc0 (pool, NBYTES (N_TAGS));
+
 	dest = g_byte_array_sized_new (in->len / 3 * 2);
 
 	p = in->data;
@@ -1553,6 +1573,10 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
 				state = content_ignore;
 			}
 
+			if (cur_tag->id != -1 && cur_tag->id < N_TAGS) {
+				setbit (hc->tags_seen, cur_tag->id);
+			}
+
 			if ((cur_tag->id == Tag_P || cur_tag->id == Tag_BR ||
 					cur_tag->id == Tag_HR) && balanced) {
 				/* Insert newline */
diff --git a/src/libserver/html.h b/src/libserver/html.h
index c70d7d6ed..4b17b5000 100644
--- a/src/libserver/html.h
+++ b/src/libserver/html.h
@@ -41,13 +41,9 @@ struct rspamd_task;
 struct html_content {
 	GNode *html_tags;
 	gint flags;
+	guchar *tags_seen;
 };
 
-/*
- * Get tag structure by its name (binary search is used)
- */
-struct html_tag * get_tag_by_name (const gchar *name);
-
 /*
  * Decode HTML entitles in text. Text is modified in place.
  */
@@ -61,4 +57,10 @@ GByteArray* rspamd_html_process_part_full (rspamd_mempool_t *pool,
 	struct html_content *hc, GByteArray *in, GList **exceptions,
 	GHashTable *urls, GHashTable *emails);
 
+/*
+ * Returns TRUE if the tag named `tagname` was seen while the part was parsed
+ * by rspamd_html_process_part_full; unknown tag names yield FALSE
+ */
+gboolean rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname);
+
 #endif
-- 
2.39.5