]> source.dussan.org Git - rspamd.git/commitdiff
Use new ac_trie for url extraction.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 6 Apr 2015 17:03:49 +0000 (18:03 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 6 Apr 2015 17:03:49 +0000 (18:03 +0100)
contrib/aho-corasic/acism.c
contrib/aho-corasic/acism.h
src/libmime/message.c
src/libserver/html.c
src/libserver/url.c
src/libserver/url.h

index 23a5eb334c45e62b5bd18cca9f2de8a508407d13..a4c67815445437acbf2c9a0ce1db7c631e518272 100644 (file)
 
 int
 acism_lookup(ac_trie_t const *psp, const char *text, size_t len,
-           ACISM_ACTION *cb, void *context)
+           ACISM_ACTION *cb, void *context, int *statep)
 {
     ac_trie_t const ps = *psp;
     char const *cp = text, *endp = cp + len;
-    STATE state = 0;
+    STATE state = *statep;
     int ret = 0;
 
     while (cp < endp) {
@@ -102,6 +102,18 @@ acism_lookup(ac_trie_t const *psp, const char *text, size_t len,
         }
     }
 EXIT:
+       *statep = state;
     return ret;
 }
+
+// Release a trie object together with the transition table it references.
+// A NULL psp is accepted and ignored.
+void
+acism_destroy(ac_trie_t *psp)
+{
+       if (!psp) return;
+       // With IS_MMAP set, the region that is unmapped starts sizeof(ac_trie_t)
+       // bytes before tranv and spans sizeof(ac_trie_t) + p_size(psp) bytes;
+       // otherwise tranv is released with free().
+       if (psp->flags & IS_MMAP)
+               munmap((char*)psp->tranv - sizeof(ac_trie_t),
+                               sizeof(ac_trie_t) + p_size(psp));
+       else free(psp->tranv);
+       // The ac_trie_t structure itself is freed in both cases.
+       free(psp);
+}
 //EOF
index af6f602535c55ebdb39937fd6295c05e29a6de55..3886b149ee41be974a4cd2d32d62aff6bd401a42 100644 (file)
@@ -46,6 +46,6 @@ typedef int (ACISM_ACTION)(int strnum, int textpos, void *context);
 // *state should initially be (0).
 
 int acism_lookup(ac_trie_t const *psp, const char *text, size_t len,
-           ACISM_ACTION *cb, void *context);
+           ACISM_ACTION *cb, void *context, int *statep);
 
 #endif//ACISM_H
index cbb2d8d31387ff528b61c29f93e526dd3cea5b71..b94d2fb19e3109faa136c06a538cd6c0b22bf4ff 100644 (file)
@@ -1517,10 +1517,11 @@ process_message (struct rspamd_task *task)
        GMimePart *part;
        GMimeDataWrapper *wrapper;
        struct received_header *recv;
-       gchar *mid, *url_str, *p, *end, *url_end;
+       gchar *mid, *url_str;
+       const gchar *url_end, *p, *end;
        struct rspamd_url *subject_url;
        gsize len;
-       gint rc;
+       gint rc, state = 0;
 
        tmp = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray));
        tmp->data = (guint8 *)task->msg.start;
@@ -1708,7 +1709,7 @@ process_message (struct rspamd_task *task)
                while (p < end) {
                        /* Search to the end of url */
                        if (rspamd_url_find (task->task_pool, p, end - p, NULL, &url_end,
-                               &url_str, FALSE)) {
+                               &url_str, FALSE, &state)) {
                                if (url_str != NULL) {
                                        subject_url = rspamd_mempool_alloc0 (task->task_pool,
                                                        sizeof (struct rspamd_url));
index f0d200e881d6c30475e50417fc79834b0bb0dacf..6ff4f4bae169009fe24e6b0193b9e6876b7c84bf 100644 (file)
@@ -682,7 +682,7 @@ check_phishing (struct rspamd_task *task,
        gchar tagbuf[128];
        struct html_tag *tag;
        gsize len = 0;
-       gint rc;
+       gint rc, state = 0;
 
        p = url_text;
        while (len < remain) {
@@ -730,7 +730,7 @@ check_phishing (struct rspamd_task *task,
        }
 
        if (rspamd_url_find (task->task_pool, url_text, len, NULL, NULL, &url_str,
-               TRUE) && url_str != NULL) {
+               TRUE, &state) && url_str != NULL) {
                new = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_url));
                g_strstrip (url_str);
                rc = rspamd_url_parse (new, url_str, strlen (url_str), task->task_pool);
index 116255e4e89884dc7dd0800746d7b23710087a01..43ce4c68fdcb01b7432692db378a6cb97fd13a81 100644 (file)
@@ -32,6 +32,7 @@
 #include "message.h"
 #include "trie.h"
 #include "http.h"
+#include "acism.h"
 
 typedef struct url_match_s {
        const gchar *m_begin;
@@ -673,8 +674,8 @@ struct url_matcher static_matchers[] = {
 
+/* Global url scanner state: the matchers plus the search trie built from their patterns */
 struct url_match_scanner {
        GArray *matchers;
-       rspamd_trie_t *search_trie;
-       rspamd_trie_t *tld_trie;
+       GArray *patterns; /* ac_trie_pat_t entries, one appended for each matcher */
+       ac_trie_t *search_trie; /* created by acism_create from patterns in rspamd_url_init */
 };
 
 struct url_match_scanner *url_scanner = NULL;
@@ -827,6 +828,7 @@ rspamd_url_parse_tld_file (const gchar *fname, struct url_match_scanner *scanner
 {
        FILE *f;
        struct url_matcher m;
+       ac_trie_pat_t pat;
        gchar *linebuf = NULL, *p;
        gsize buflen = 0, patlen;
        gssize r;
@@ -876,8 +878,11 @@ rspamd_url_parse_tld_file (const gchar *fname, struct url_match_scanner *scanner
                patlen = strlen (p);
                m.pattern = g_malloc (patlen + 2);
                m.pattern[0] = '.';
+               pat.ptr = m.pattern;
+               pat.len = patlen + 1;
                rspamd_strlcpy (&m.pattern[1], p, patlen + 1);
                g_array_append_val (url_scanner->matchers, m);
+               g_array_append_val (url_scanner->patterns, pat);
        }
 
        free (linebuf);
@@ -885,27 +890,30 @@ rspamd_url_parse_tld_file (const gchar *fname, struct url_match_scanner *scanner
 }
 
+/*
+ * Append the built-in static_matchers table to sc->matchers and add one
+ * ac_trie_pat_t (pattern pointer plus its strlen) per static matcher to
+ * sc->patterns, in the same order.
+ */
 static void
-rspamd_url_add_static_matchers (GArray *matchers)
+rspamd_url_add_static_matchers (struct url_match_scanner *sc)
 {
-       gint n = G_N_ELEMENTS (static_matchers);
+       gint n = G_N_ELEMENTS (static_matchers), i;
+       ac_trie_pat_t pat;
 
-       g_array_append_vals (matchers, static_matchers, n);
+       g_array_append_vals (sc->matchers, static_matchers, n);
+
+       /*
+        * NOTE(review): rspamd_url_trie_callback looks up the matcher by the
+        * strnum reported by the trie, so patterns presumably has to stay
+        * index-aligned with matchers - confirm against acism_create.
+        */
+       for (i = 0; i < n; i ++) {
+               pat.ptr = static_matchers[i].pattern;
+               pat.len = strlen (pat.ptr);
+               g_array_append_val (sc->patterns, pat);
+       }
 }
 
 void
 rspamd_url_init (const gchar *tld_file)
 {
-       guint i;
-       gchar patbuf[128];
-       struct url_matcher *m;
-
        if (url_scanner == NULL) {
                url_scanner = g_malloc (sizeof (struct url_match_scanner));
-               url_scanner->matchers = g_array_new (FALSE, TRUE,
-                               sizeof (struct url_matcher));
-               url_scanner->search_trie = rspamd_trie_create (TRUE);
-               url_scanner->tld_trie = rspamd_trie_create (TRUE);
-               rspamd_url_add_static_matchers (url_scanner->matchers);
+               url_scanner->matchers = g_array_sized_new (FALSE, TRUE,
+                               sizeof (struct url_matcher), 512);
+               url_scanner->patterns = g_array_sized_new (FALSE, TRUE,
+                               sizeof (ac_trie_pat_t), 512);
+               rspamd_url_add_static_matchers (url_scanner);
 
                if (tld_file != NULL) {
                        rspamd_url_parse_tld_file (tld_file, url_scanner);
@@ -914,16 +922,11 @@ rspamd_url_init (const gchar *tld_file)
                        msg_warn ("tld extension file is not specified, url matching is limited");
                }
 
-               for (i = 0; i < url_scanner->matchers->len; i++) {
-                       m = &g_array_index (url_scanner->matchers, struct url_matcher, i);
-
-                       rspamd_trie_insert (url_scanner->search_trie, m->pattern, i);
+               url_scanner->search_trie = acism_create (
+                               (const ac_trie_pat_t *)url_scanner->patterns->data,
+                               url_scanner->patterns->len);
 
-                       /* Also use it for TLD lookups */
-                       if (strcmp (m->prefix, "http://") == 0) {
-                               rspamd_trie_insert (url_scanner->tld_trie, m->pattern, i);
-                       }
-               }
+               msg_info ("initialized ac_trie of %ud elements", url_scanner->patterns->len);
        }
 }
 
@@ -1822,12 +1825,11 @@ rspamd_url_text_extract (rspamd_mempool_t * pool,
        struct mime_text_part *part,
        gboolean is_html)
 {
-       gint rc;
-       gchar *url_str = NULL, *url_start, *url_end;
+       gint rc, state = 0;
+       gchar *url_str = NULL;
        struct rspamd_url *new;
        struct process_exception *ex;
-       gchar *p, *end, *begin;
-
+       const gchar *p, *end, *begin, *url_start, *url_end;
 
        if (part->content == NULL || part->content->len == 0) {
                msg_warn ("got empty text part");
@@ -1839,7 +1841,7 @@ rspamd_url_text_extract (rspamd_mempool_t * pool,
        p = begin;
        while (p < end) {
                if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str,
-                               is_html)) {
+                               is_html, &state)) {
                        if (url_str != NULL) {
                                new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
                                ex =
@@ -1889,67 +1891,97 @@ rspamd_url_text_extract (rspamd_mempool_t * pool,
        }
 }
 
-gboolean
-rspamd_url_find (rspamd_mempool_t *pool,
-       const gchar *begin,
-       gsize len,
-       gchar **start,
-       gchar **fin,
-       gchar **url_str,
-       gboolean is_html)
+/*
+ * Context passed through acism_lookup to rspamd_url_trie_callback:
+ * rspamd_url_find zeroes it and fills the search window and options,
+ * the callback stores the extracted url and its bounds.
+ */
+struct url_callback_data {
+       const gchar *begin; /* start of the scanned text */
+       gchar *url_str; /* url string allocated from pool; NULL after a rejected candidate */
+       rspamd_mempool_t *pool; /* pool used to allocate url_str */
+       gint len; /* match length plus prefix length; only set when a prefix is prepended */
+       gboolean is_html; /* when TRUE, matchers flagged URL_FLAG_NOHTML are skipped */
+       const gchar *start; /* m_begin of the accepted match */
+       const gchar *fin; /* m_begin + m_len of the accepted match */
+       const gchar *end; /* begin + len, i.e. the end of the scanned text */
+};
+
+/*
+ * Callback passed to acism_lookup; called with each pattern hit.
+ *
+ * @param strnum index of the hit pattern, used to select the matcher
+ * @param textpos offset reported by the trie, applied relative to cb->begin
+ * @param context struct url_callback_data prepared by rspamd_url_find
+ * @return 1 once a url has been stored in the callback data, 0 to continue
+ * the search (hit skipped for html text or rejected by the matcher)
+ *
+ * NOTE(review): acism appears to report textpos as the offset just past the
+ * end of the matched pattern - confirm against acism.h that this is the
+ * position the matcher start/end handlers expect in pos.
+ */
+static gint
+rspamd_url_trie_callback (int strnum, int textpos, void *context)
 {
-       const gchar *end, *pos;
-       gint idx, l;
        struct url_matcher *matcher;
        url_match_t m;
+       const gchar *pos;
+       struct url_callback_data *cb = context;
 
-       end = begin + len;
-       if ((pos =
-                       rspamd_trie_lookup (url_scanner->search_trie, begin, len,
-                                       &idx)) == NULL) {
-               return FALSE;
+       matcher = &g_array_index (url_scanner->matchers, struct url_matcher, strnum);
+       if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) {
+               /* Do not try to match non-html like urls in html texts */
+               return 0;
+       }
+
+       m.pattern = matcher->pattern;
+       m.prefix = matcher->prefix;
+       m.add_prefix = FALSE;
+       pos = cb->begin + textpos;
+
+       if (matcher->start (cb->begin, cb->end, pos,
+                       &m) && matcher->end (cb->begin, cb->end, pos, &m)) {
+               if (m.add_prefix || matcher->prefix[0] != '\0') {
+                       cb->len = m.m_len + strlen (m.prefix);
+                       cb->url_str = rspamd_mempool_alloc (cb->pool, cb->len + 1);
+                       /*
+                        * Pass the full buffer size including the terminator, as
+                        * the pre-trie code did; passing cb->len would cut off
+                        * the last character of the url.
+                        */
+                       rspamd_snprintf (cb->url_str,
+                                       cb->len + 1,
+                                       "%s%*s",
+                                       m.prefix,
+                                       m.m_len,
+                                       m.m_begin);
+               }
+               else {
+                       cb->url_str = rspamd_mempool_alloc (cb->pool, m.m_len + 1);
+                       rspamd_strlcpy (cb->url_str, m.m_begin, m.m_len + 1);
+               }
+
+               /* Both fields are const gchar *, so no cast is needed */
+               cb->start = m.m_begin;
+               cb->fin = m.m_begin + m.m_len;
+
+               return 1;
        }
        else {
-               matcher = &g_array_index (url_scanner->matchers, struct url_matcher, idx);
-               if ((matcher->flags & URL_FLAG_NOHTML) && is_html) {
-                       /* Do not try to match non-html like urls in html texts */
-                       return FALSE;
+               cb->url_str = NULL;
+       }
+
+       /* Continue search */
+       return 0;
+}
+
+gboolean
+rspamd_url_find (rspamd_mempool_t *pool,
+       const gchar *begin,
+       gsize len,
+       const gchar **start,
+       const gchar **fin,
+       gchar **url_str,
+       gboolean is_html,
+       gint *statep)
+{
+       struct url_callback_data cb;
+       gint ret;
+
+       g_assert (statep != NULL);
+       memset (&cb, 0, sizeof (cb));
+       cb.begin = begin;
+       cb.end = begin + len;
+       cb.is_html = is_html;
+       cb.pool = pool;
+       ret = acism_lookup (url_scanner->search_trie, begin, len,
+                       rspamd_url_trie_callback, &cb, statep);
+
+       if (ret) {
+               if (start) {
+                       *start = cb.start;
                }
-               m.pattern = matcher->pattern;
-               m.prefix = matcher->prefix;
-               m.add_prefix = FALSE;
-               if (matcher->start (begin, end, pos,
-                               &m) && matcher->end (begin, end, pos, &m)) {
-                       if (m.add_prefix || matcher->prefix[0] != '\0') {
-                               l = m.m_len + 1 + strlen (m.prefix);
-                               *url_str = rspamd_mempool_alloc (pool, l);
-                               rspamd_snprintf (*url_str,
-                                               l,
-                                               "%s%*s",
-                                               m.prefix,
-                                               m.m_len,
-                                               m.m_begin);
-                       }
-                       else {
-                               *url_str = rspamd_mempool_alloc (pool, m.m_len + 1);
-                               memcpy (*url_str, m.m_begin, m.m_len);
-                               (*url_str)[m.m_len] = '\0';
-                       }
-                       if (start != NULL) {
-                               *start = (gchar *)m.m_begin;
-                       }
-                       if (fin != NULL) {
-                               *fin = (gchar *)m.m_begin + m.m_len;
-                       }
+               if (fin) {
+                       *fin = cb.fin;
                }
-               else {
-                       *url_str = NULL;
-                       if (start != NULL) {
-                               *start = (gchar *)pos;
-                       }
-                       if (fin != NULL) {
-                               *fin = (gchar *)pos + strlen (m.prefix);
-                       }
+               if (url_str) {
+                       *url_str = cb.url_str;
                }
 
                return TRUE;
@@ -1960,10 +1992,10 @@ rspamd_url_find (rspamd_mempool_t *pool,
 
 struct rspamd_url *
 rspamd_url_get_next (rspamd_mempool_t *pool,
-               const gchar *start, gchar const **pos)
+               const gchar *start, gchar const **pos, gint *statep)
 {
-       const gchar *p, *end;
-       gchar *url_str = NULL, *url_start, *url_end;
+       const gchar *p, *end, *url_start, *url_end;
+       gchar *url_str = NULL;
        struct rspamd_url *new;
        gint rc;
 
@@ -1978,7 +2010,7 @@ rspamd_url_get_next (rspamd_mempool_t *pool,
 
        if (p < end) {
                if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str,
-                               FALSE)) {
+                               FALSE, statep)) {
                        if (url_str != NULL) {
                                new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
 
index 8dc2a70321a97e0ff2d9a4b525e44c35c12983f1..1c76294cd23cd9f449a4cc9b116a95bc24bbfed8 100644 (file)
@@ -104,11 +104,11 @@ enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
 gboolean rspamd_url_find (rspamd_mempool_t *pool,
        const gchar *begin,
        gsize len,
-       gchar **start,
-       gchar **end,
+       const gchar **start,
+       const gchar **end,
        gchar **url_str,
-       gboolean is_html);
-
+       gboolean is_html,
+       gint *statep);
 /*
  * Return text representation of url parsing error
  */
@@ -123,6 +123,6 @@ const gchar * rspamd_url_strerror (enum uri_errno err);
  */
 struct rspamd_url *
 rspamd_url_get_next (rspamd_mempool_t *pool,
-               const gchar *start, gchar const **pos);
+               const gchar *start, gchar const **pos, gint *statep);
 
 #endif