From: Vsevolod Stakhov Date: Mon, 6 Apr 2015 17:03:49 +0000 (+0100) Subject: Use new ac_trie for url extraction. X-Git-Tag: 0.9.0~318^2~3 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=d6724d926dcbc2fd204f989c1b71b6a0cc52e02a;p=rspamd.git Use new ac_trie for url extraction. --- diff --git a/contrib/aho-corasic/acism.c b/contrib/aho-corasic/acism.c index 23a5eb334..a4c678154 100644 --- a/contrib/aho-corasic/acism.c +++ b/contrib/aho-corasic/acism.c @@ -24,11 +24,11 @@ int acism_lookup(ac_trie_t const *psp, const char *text, size_t len, - ACISM_ACTION *cb, void *context) + ACISM_ACTION *cb, void *context, int *statep) { ac_trie_t const ps = *psp; char const *cp = text, *endp = cp + len; - STATE state = 0; + STATE state = *statep; int ret = 0; while (cp < endp) { @@ -102,6 +102,18 @@ acism_lookup(ac_trie_t const *psp, const char *text, size_t len, } } EXIT: + *statep = state; return ret; } + +void +acism_destroy(ac_trie_t *psp) +{ + if (!psp) return; + if (psp->flags & IS_MMAP) + munmap((char*)psp->tranv - sizeof(ac_trie_t), + sizeof(ac_trie_t) + p_size(psp)); + else free(psp->tranv); + free(psp); +} //EOF diff --git a/contrib/aho-corasic/acism.h b/contrib/aho-corasic/acism.h index af6f60253..3886b149e 100644 --- a/contrib/aho-corasic/acism.h +++ b/contrib/aho-corasic/acism.h @@ -46,6 +46,6 @@ typedef int (ACISM_ACTION)(int strnum, int textpos, void *context); // *state should initially be (0). int acism_lookup(ac_trie_t const *psp, const char *text, size_t len, - ACISM_ACTION *cb, void *context); + ACISM_ACTION *cb, void *context, int *statep); #endif//ACISM_H diff --git a/src/libmime/message.c b/src/libmime/message.c index cbb2d8d31..b94d2fb19 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -1517,10 +1517,11 @@ process_message (struct rspamd_task *task) GMimePart *part; GMimeDataWrapper *wrapper; struct received_header *recv; - gchar *mid, *url_str, *p, *end, *url_end; + gchar *mid, *url_str; + const gchar *url_end, *p, *end; struct rspamd_url *subject_url; gsize len; - gint rc; + gint rc, state = 0; tmp = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray)); tmp->data = (guint8 *)task->msg.start; @@ -1708,7 +1709,7 @@ process_message (struct rspamd_task *task) while (p < end) { /* Search to the end of url */ if (rspamd_url_find (task->task_pool, p, end - p, NULL, &url_end, - &url_str, FALSE)) { + &url_str, FALSE, &state)) { if (url_str != NULL) { subject_url = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_url)); diff --git a/src/libserver/html.c b/src/libserver/html.c index f0d200e88..6ff4f4bae 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -682,7 +682,7 @@ check_phishing (struct rspamd_task *task, gchar tagbuf[128]; struct html_tag *tag; gsize len = 0; - gint rc; + gint rc, state = 0; p = url_text; while (len < remain) { @@ -730,7 +730,7 @@ check_phishing (struct rspamd_task *task, } if (rspamd_url_find (task->task_pool, url_text, len, NULL, NULL, &url_str, - TRUE) && url_str != NULL) { + TRUE, &state) && url_str != NULL) { new = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_url)); g_strstrip (url_str); rc = rspamd_url_parse (new, url_str, strlen (url_str), task->task_pool); diff --git a/src/libserver/url.c b/src/libserver/url.c index 116255e4e..43ce4c68f 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -32,6 +32,7 @@ #include "message.h" #include "trie.h" #include "http.h" +#include "acism.h" typedef struct url_match_s { const gchar *m_begin; @@ -673,8 +674,8 @@ struct url_matcher static_matchers[] = { struct url_match_scanner { GArray *matchers; - rspamd_trie_t *search_trie; - rspamd_trie_t *tld_trie; + GArray *patterns; + ac_trie_t *search_trie; }; struct url_match_scanner *url_scanner = NULL; @@ -827,6 +828,7 @@ rspamd_url_parse_tld_file (const gchar *fname, struct url_match_scanner *scanner { FILE *f; struct url_matcher m; + ac_trie_pat_t pat; gchar *linebuf = NULL, *p; gsize buflen = 0, patlen; gssize r; @@ -876,8 +878,11 @@ rspamd_url_parse_tld_file (const gchar *fname, struct url_match_scanner *scanner patlen = strlen (p); m.pattern = g_malloc (patlen + 2); m.pattern[0] = '.'; + pat.ptr = m.pattern; + pat.len = patlen + 1; rspamd_strlcpy (&m.pattern[1], p, patlen + 1); g_array_append_val (url_scanner->matchers, m); + g_array_append_val (url_scanner->patterns, pat); } free (linebuf); @@ -885,27 +890,30 @@ rspamd_url_parse_tld_file (const gchar *fname, struct url_match_scanner *scanner } static void -rspamd_url_add_static_matchers (GArray *matchers) +rspamd_url_add_static_matchers (struct url_match_scanner *sc) { - gint n = G_N_ELEMENTS (static_matchers); + gint n = G_N_ELEMENTS (static_matchers), i; + ac_trie_pat_t pat; - g_array_append_vals (matchers, static_matchers, n); + g_array_append_vals (sc->matchers, static_matchers, n); + + for (i = 0; i < n; i ++) { + pat.ptr = static_matchers[i].pattern; + pat.len = strlen (pat.ptr); + g_array_append_val (sc->patterns, pat); + } } void rspamd_url_init (const gchar *tld_file) { - guint i; - gchar patbuf[128]; - struct url_matcher *m; - if (url_scanner == NULL) { url_scanner = g_malloc (sizeof (struct url_match_scanner)); - url_scanner->matchers = g_array_new (FALSE, TRUE, - sizeof (struct url_matcher)); - url_scanner->search_trie = rspamd_trie_create (TRUE); - url_scanner->tld_trie = rspamd_trie_create (TRUE); - rspamd_url_add_static_matchers (url_scanner->matchers); + url_scanner->matchers = g_array_sized_new (FALSE, TRUE, + sizeof (struct url_matcher), 512); + url_scanner->patterns = g_array_sized_new (FALSE, TRUE, + sizeof (ac_trie_pat_t), 512); + rspamd_url_add_static_matchers (url_scanner); if (tld_file != NULL) { rspamd_url_parse_tld_file (tld_file, url_scanner); @@ -914,16 +922,11 @@ rspamd_url_init (const gchar *tld_file) msg_warn ("tld extension file is not specified, url matching is limited"); } - for (i = 0; i < url_scanner->matchers->len; i++) { - m = &g_array_index (url_scanner->matchers, struct url_matcher, i); - - rspamd_trie_insert (url_scanner->search_trie, m->pattern, i); + url_scanner->search_trie = acism_create ( + (const ac_trie_pat_t *)url_scanner->patterns->data, + url_scanner->patterns->len); - /* Also use it for TLD lookups */ - if (strcmp (m->prefix, "http://") == 0) { - rspamd_trie_insert (url_scanner->tld_trie, m->pattern, i); - } - } + msg_info ("initialized ac_trie of %ud elements", url_scanner->patterns->len); } } @@ -1822,12 +1825,11 @@ rspamd_url_text_extract (rspamd_mempool_t * pool, struct mime_text_part *part, gboolean is_html) { - gint rc; - gchar *url_str = NULL, *url_start, *url_end; + gint rc, state = 0; + gchar *url_str = NULL; struct rspamd_url *new; struct process_exception *ex; - gchar *p, *end, *begin; - + const gchar *p, *end, *begin, *url_start, *url_end; if (part->content == NULL || part->content->len == 0) { msg_warn ("got empty text part"); @@ -1839,7 +1841,7 @@ rspamd_url_text_extract (rspamd_mempool_t * pool, p = begin; while (p < end) { if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str, - is_html)) { + is_html, &state)) { if (url_str != NULL) { new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url)); ex = @@ -1889,67 +1891,97 @@ rspamd_url_text_extract (rspamd_mempool_t * pool, } } -gboolean -rspamd_url_find (rspamd_mempool_t *pool, - const gchar *begin, - gsize len, - gchar **start, - gchar **fin, - gchar **url_str, - gboolean is_html) +struct url_callback_data { + const gchar *begin; + gchar *url_str; + rspamd_mempool_t *pool; + gint len; + gboolean is_html; + const gchar *start; + const gchar *fin; + const gchar *end; +}; + +static gint +rspamd_url_trie_callback (int strnum, int textpos, void *context) { - const gchar *end, *pos; - gint idx, l; struct url_matcher *matcher; url_match_t m; + const gchar *pos; + struct url_callback_data *cb = context; - end = begin + len; - if ((pos = - rspamd_trie_lookup (url_scanner->search_trie, begin, len, - &idx)) == NULL) { - return FALSE; + matcher = &g_array_index (url_scanner->matchers, struct url_matcher, strnum); + if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) { + /* Do not try to match non-html like urls in html texts */ + return 0; + } + + m.pattern = matcher->pattern; + m.prefix = matcher->prefix; + m.add_prefix = FALSE; + pos = cb->begin + textpos; + + if (matcher->start (cb->begin, cb->end, pos, + &m) && matcher->end (cb->begin, cb->end, pos, &m)) { + if (m.add_prefix || matcher->prefix[0] != '\0') { + cb->len = m.m_len + strlen (m.prefix); + cb->url_str = rspamd_mempool_alloc (cb->pool, cb->len + 1); + rspamd_snprintf (cb->url_str, + cb->len, + "%s%*s", + m.prefix, + m.m_len, + m.m_begin); + } + else { + cb->url_str = rspamd_mempool_alloc (cb->pool, m.m_len + 1); + rspamd_strlcpy (cb->url_str, m.m_begin, m.m_len + 1); + } + + cb->start = (gchar *)m.m_begin; + cb->fin = (gchar *)m.m_begin + m.m_len; + + return 1; } else { - matcher = &g_array_index (url_scanner->matchers, struct url_matcher, idx); - if ((matcher->flags & URL_FLAG_NOHTML) && is_html) { - /* Do not try to match non-html like urls in html texts */ - return FALSE; + cb->url_str = NULL; + } + + /* Continue search */ + return 0; +} + +gboolean +rspamd_url_find (rspamd_mempool_t *pool, + const gchar *begin, + gsize len, + const gchar **start, + const gchar **fin, + gchar **url_str, + gboolean is_html, + gint *statep) +{ + struct url_callback_data cb; + gint ret; + + g_assert (statep != NULL); + memset (&cb, 0, sizeof (cb)); + cb.begin = begin; + cb.end = begin + len; + cb.is_html = is_html; + cb.pool = pool; + ret = acism_lookup (url_scanner->search_trie, begin, len, + rspamd_url_trie_callback, &cb, statep); + + if (ret) { + if (start) { + *start = cb.start; } - m.pattern = matcher->pattern; - m.prefix = matcher->prefix; - m.add_prefix = FALSE; - if (matcher->start (begin, end, pos, - &m) && matcher->end (begin, end, pos, &m)) { - if (m.add_prefix || matcher->prefix[0] != '\0') { - l = m.m_len + 1 + strlen (m.prefix); - *url_str = rspamd_mempool_alloc (pool, l); - rspamd_snprintf (*url_str, - l, - "%s%*s", - m.prefix, - m.m_len, - m.m_begin); - } - else { - *url_str = rspamd_mempool_alloc (pool, m.m_len + 1); - memcpy (*url_str, m.m_begin, m.m_len); - (*url_str)[m.m_len] = '\0'; - } - if (start != NULL) { - *start = (gchar *)m.m_begin; - } - if (fin != NULL) { - *fin = (gchar *)m.m_begin + m.m_len; - } + if (fin) { + *fin = cb.fin; } - else { - *url_str = NULL; - if (start != NULL) { - *start = (gchar *)pos; - } - if (fin != NULL) { - *fin = (gchar *)pos + strlen (m.prefix); - } + if (url_str) { + *url_str = cb.url_str; } return TRUE; @@ -1960,10 +1992,10 @@ rspamd_url_find (rspamd_mempool_t *pool, struct rspamd_url * rspamd_url_get_next (rspamd_mempool_t *pool, - const gchar *start, gchar const **pos) + const gchar *start, gchar const **pos, gint *statep) { - const gchar *p, *end; - gchar *url_str = NULL, *url_start, *url_end; + const gchar *p, *end, *url_start, *url_end; + gchar *url_str = NULL; struct rspamd_url *new; gint rc; @@ -1978,7 +2010,7 @@ rspamd_url_get_next (rspamd_mempool_t *pool, if (p < end) { if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str, - FALSE)) { + FALSE, statep)) { if (url_str != NULL) { new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url)); diff --git a/src/libserver/url.h b/src/libserver/url.h index 8dc2a7032..1c76294cd 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -104,11 +104,11 @@ enum uri_errno rspamd_url_parse (struct rspamd_url *uri, gboolean rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len, - gchar **start, - gchar **end, + const gchar **start, + const gchar **end, gchar **url_str, - gboolean is_html); - + gboolean is_html, + gint *statep); /* * Return text representation of url parsing error */ @@ -123,6 +123,6 @@ const gchar * rspamd_url_strerror (enum uri_errno err); */ struct rspamd_url * rspamd_url_get_next (rspamd_mempool_t *pool, - const gchar *start, gchar const **pos); + const gchar *start, gchar const **pos, gint *statep); #endif