Use new ac_trie for url extraction.

This commit is contained in:
Vsevolod Stakhov 2015-04-06 18:03:49 +01:00
parent 680a2b66cb
commit d6724d926d
6 changed files with 142 additions and 97 deletions

View File

@ -24,11 +24,11 @@
int
acism_lookup(ac_trie_t const *psp, const char *text, size_t len,
ACISM_ACTION *cb, void *context)
ACISM_ACTION *cb, void *context, int *statep)
{
ac_trie_t const ps = *psp;
char const *cp = text, *endp = cp + len;
STATE state = 0;
STATE state = *statep;
int ret = 0;
while (cp < endp) {
@ -102,6 +102,18 @@ acism_lookup(ac_trie_t const *psp, const char *text, size_t len,
}
}
EXIT:
*statep = state;
return ret;
}
/*
 * Release an ac_trie_t and the transition storage it refers to.
 *
 * A NULL argument is accepted and ignored. When the IS_MMAP flag is set,
 * the region that starts sizeof(ac_trie_t) bytes before `tranv` is unmapped;
 * otherwise `tranv` is released with free(). The descriptor itself is
 * always released with free().
 */
void
acism_destroy(ac_trie_t *psp)
{
	if (psp == NULL) {
		return;
	}

	if ((psp->flags & IS_MMAP) != 0) {
		/* The unmapped span covers a header-sized prefix plus p_size(psp) bytes */
		char *map_base = (char *)psp->tranv - sizeof(ac_trie_t);

		munmap(map_base, sizeof(ac_trie_t) + p_size(psp));
	}
	else {
		free(psp->tranv);
	}

	free(psp);
}
//EOF

View File

@ -46,6 +46,6 @@ typedef int (ACISM_ACTION)(int strnum, int textpos, void *context);
// *statep should initially be (0); it is updated with the final state on return.
int acism_lookup(ac_trie_t const *psp, const char *text, size_t len,
ACISM_ACTION *cb, void *context);
ACISM_ACTION *cb, void *context, int *statep);
#endif//ACISM_H

View File

@ -1517,10 +1517,11 @@ process_message (struct rspamd_task *task)
GMimePart *part;
GMimeDataWrapper *wrapper;
struct received_header *recv;
gchar *mid, *url_str, *p, *end, *url_end;
gchar *mid, *url_str;
const gchar *url_end, *p, *end;
struct rspamd_url *subject_url;
gsize len;
gint rc;
gint rc, state = 0;
tmp = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray));
tmp->data = (guint8 *)task->msg.start;
@ -1708,7 +1709,7 @@ process_message (struct rspamd_task *task)
while (p < end) {
/* Search to the end of url */
if (rspamd_url_find (task->task_pool, p, end - p, NULL, &url_end,
&url_str, FALSE)) {
&url_str, FALSE, &state)) {
if (url_str != NULL) {
subject_url = rspamd_mempool_alloc0 (task->task_pool,
sizeof (struct rspamd_url));

View File

@ -682,7 +682,7 @@ check_phishing (struct rspamd_task *task,
gchar tagbuf[128];
struct html_tag *tag;
gsize len = 0;
gint rc;
gint rc, state = 0;
p = url_text;
while (len < remain) {
@ -730,7 +730,7 @@ check_phishing (struct rspamd_task *task,
}
if (rspamd_url_find (task->task_pool, url_text, len, NULL, NULL, &url_str,
TRUE) && url_str != NULL) {
TRUE, &state) && url_str != NULL) {
new = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_url));
g_strstrip (url_str);
rc = rspamd_url_parse (new, url_str, strlen (url_str), task->task_pool);

View File

@ -32,6 +32,7 @@
#include "message.h"
#include "trie.h"
#include "http.h"
#include "acism.h"
typedef struct url_match_s {
const gchar *m_begin;
@ -673,8 +674,8 @@ struct url_matcher static_matchers[] = {
struct url_match_scanner {
GArray *matchers;
rspamd_trie_t *search_trie;
rspamd_trie_t *tld_trie;
GArray *patterns;
ac_trie_t *search_trie;
};
struct url_match_scanner *url_scanner = NULL;
@ -827,6 +828,7 @@ rspamd_url_parse_tld_file (const gchar *fname, struct url_match_scanner *scanner
{
FILE *f;
struct url_matcher m;
ac_trie_pat_t pat;
gchar *linebuf = NULL, *p;
gsize buflen = 0, patlen;
gssize r;
@ -876,8 +878,11 @@ rspamd_url_parse_tld_file (const gchar *fname, struct url_match_scanner *scanner
patlen = strlen (p);
m.pattern = g_malloc (patlen + 2);
m.pattern[0] = '.';
pat.ptr = m.pattern;
pat.len = patlen + 1;
rspamd_strlcpy (&m.pattern[1], p, patlen + 1);
g_array_append_val (url_scanner->matchers, m);
g_array_append_val (url_scanner->patterns, pat);
}
free (linebuf);
@ -885,27 +890,30 @@ rspamd_url_parse_tld_file (const gchar *fname, struct url_match_scanner *scanner
}
static void
rspamd_url_add_static_matchers (GArray *matchers)
rspamd_url_add_static_matchers (struct url_match_scanner *sc)
{
gint n = G_N_ELEMENTS (static_matchers);
gint n = G_N_ELEMENTS (static_matchers), i;
ac_trie_pat_t pat;
g_array_append_vals (matchers, static_matchers, n);
g_array_append_vals (sc->matchers, static_matchers, n);
for (i = 0; i < n; i ++) {
pat.ptr = static_matchers[i].pattern;
pat.len = strlen (pat.ptr);
g_array_append_val (sc->patterns, pat);
}
}
void
rspamd_url_init (const gchar *tld_file)
{
guint i;
gchar patbuf[128];
struct url_matcher *m;
if (url_scanner == NULL) {
url_scanner = g_malloc (sizeof (struct url_match_scanner));
url_scanner->matchers = g_array_new (FALSE, TRUE,
sizeof (struct url_matcher));
url_scanner->search_trie = rspamd_trie_create (TRUE);
url_scanner->tld_trie = rspamd_trie_create (TRUE);
rspamd_url_add_static_matchers (url_scanner->matchers);
url_scanner->matchers = g_array_sized_new (FALSE, TRUE,
sizeof (struct url_matcher), 512);
url_scanner->patterns = g_array_sized_new (FALSE, TRUE,
sizeof (ac_trie_pat_t), 512);
rspamd_url_add_static_matchers (url_scanner);
if (tld_file != NULL) {
rspamd_url_parse_tld_file (tld_file, url_scanner);
@ -914,16 +922,11 @@ rspamd_url_init (const gchar *tld_file)
msg_warn ("tld extension file is not specified, url matching is limited");
}
for (i = 0; i < url_scanner->matchers->len; i++) {
m = &g_array_index (url_scanner->matchers, struct url_matcher, i);
url_scanner->search_trie = acism_create (
(const ac_trie_pat_t *)url_scanner->patterns->data,
url_scanner->patterns->len);
rspamd_trie_insert (url_scanner->search_trie, m->pattern, i);
/* Also use it for TLD lookups */
if (strcmp (m->prefix, "http://") == 0) {
rspamd_trie_insert (url_scanner->tld_trie, m->pattern, i);
}
}
msg_info ("initialized ac_trie of %ud elements", url_scanner->patterns->len);
}
}
@ -1822,12 +1825,11 @@ rspamd_url_text_extract (rspamd_mempool_t * pool,
struct mime_text_part *part,
gboolean is_html)
{
gint rc;
gchar *url_str = NULL, *url_start, *url_end;
gint rc, state = 0;
gchar *url_str = NULL;
struct rspamd_url *new;
struct process_exception *ex;
gchar *p, *end, *begin;
const gchar *p, *end, *begin, *url_start, *url_end;
if (part->content == NULL || part->content->len == 0) {
msg_warn ("got empty text part");
@ -1839,7 +1841,7 @@ rspamd_url_text_extract (rspamd_mempool_t * pool,
p = begin;
while (p < end) {
if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str,
is_html)) {
is_html, &state)) {
if (url_str != NULL) {
new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
ex =
@ -1889,67 +1891,97 @@ rspamd_url_text_extract (rspamd_mempool_t * pool,
}
}
/*
 * Context shared between rspamd_url_find () and rspamd_url_trie_callback ():
 * the caller fills in the scanned range and options, the callback stores
 * the extracted url and its bounds.
 */
struct url_callback_data {
/* Start of the scanned text; trie match offsets are added to this pointer */
const gchar *begin;
/* Extracted url allocated from `pool`, or NULL when validation failed */
gchar *url_str;
/* Memory pool used to allocate `url_str` */
rspamd_mempool_t *pool;
/* Prefix length plus match length; written only when a prefix is prepended */
gint len;
/* TRUE when scanning html text: matchers flagged URL_FLAG_NOHTML are skipped */
gboolean is_html;
/* Beginning of the matched url inside the scanned text */
const gchar *start;
/* One past the last character of the matched url inside the scanned text */
const gchar *fin;
/* End of the scanned text (begin + length) */
const gchar *end;
};
/*
 * Callback invoked by acism_lookup () for every pattern hit.
 *
 * @param strnum index of the matched pattern; used to index
 *        url_scanner->matchers
 * @param textpos offset reported by the trie, added to cb->begin to obtain
 *        the position handed to the matcher's start/end functions
 * @param context pointer to struct url_callback_data
 * @return 1 when a url has been extracted (cb->url_str, cb->start and
 *         cb->fin are filled), 0 to let the search continue
 */
static gint
rspamd_url_trie_callback (int strnum, int textpos, void *context)
{
	struct url_matcher *matcher;
	url_match_t m;
	const gchar *pos;
	struct url_callback_data *cb = context;

	matcher = &g_array_index (url_scanner->matchers, struct url_matcher, strnum);

	if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) {
		/* Do not try to match non-html like urls in html texts */
		return 0;
	}

	m.pattern = matcher->pattern;
	m.prefix = matcher->prefix;
	m.add_prefix = FALSE;
	pos = cb->begin + textpos;

	if (matcher->start (cb->begin, cb->end, pos, &m) &&
		matcher->end (cb->begin, cb->end, pos, &m)) {
		if (m.add_prefix || matcher->prefix[0] != '\0') {
			cb->len = m.m_len + strlen (m.prefix);
			cb->url_str = rspamd_mempool_alloc (cb->pool, cb->len + 1);
			/*
			 * The size argument must cover the whole buffer including the
			 * terminating '\0'; passing cb->len would cut the last
			 * character of the url.
			 */
			rspamd_snprintf (cb->url_str,
				cb->len + 1,
				"%s%*s",
				m.prefix,
				m.m_len,
				m.m_begin);
		}
		else {
			cb->url_str = rspamd_mempool_alloc (cb->pool, m.m_len + 1);
			rspamd_strlcpy (cb->url_str, m.m_begin, m.m_len + 1);
		}

		cb->start = m.m_begin;
		cb->fin = m.m_begin + m.m_len;

		/* Non-zero return stops the trie scan */
		return 1;
	}
	else {
		/* Pattern was hit but did not validate as a url */
		cb->url_str = NULL;
	}

	/* Continue search */
	return 0;
}
gboolean
rspamd_url_find (rspamd_mempool_t *pool,
const gchar *begin,
gsize len,
gchar **start,
gchar **fin,
const gchar **start,
const gchar **fin,
gchar **url_str,
gboolean is_html)
gboolean is_html,
gint *statep)
{
const gchar *end, *pos;
gint idx, l;
struct url_matcher *matcher;
url_match_t m;
struct url_callback_data cb;
gint ret;
end = begin + len;
if ((pos =
rspamd_trie_lookup (url_scanner->search_trie, begin, len,
&idx)) == NULL) {
return FALSE;
}
else {
matcher = &g_array_index (url_scanner->matchers, struct url_matcher, idx);
if ((matcher->flags & URL_FLAG_NOHTML) && is_html) {
/* Do not try to match non-html like urls in html texts */
return FALSE;
g_assert (statep != NULL);
memset (&cb, 0, sizeof (cb));
cb.begin = begin;
cb.end = begin + len;
cb.is_html = is_html;
cb.pool = pool;
ret = acism_lookup (url_scanner->search_trie, begin, len,
rspamd_url_trie_callback, &cb, statep);
if (ret) {
if (start) {
*start = cb.start;
}
m.pattern = matcher->pattern;
m.prefix = matcher->prefix;
m.add_prefix = FALSE;
if (matcher->start (begin, end, pos,
&m) && matcher->end (begin, end, pos, &m)) {
if (m.add_prefix || matcher->prefix[0] != '\0') {
l = m.m_len + 1 + strlen (m.prefix);
*url_str = rspamd_mempool_alloc (pool, l);
rspamd_snprintf (*url_str,
l,
"%s%*s",
m.prefix,
m.m_len,
m.m_begin);
}
else {
*url_str = rspamd_mempool_alloc (pool, m.m_len + 1);
memcpy (*url_str, m.m_begin, m.m_len);
(*url_str)[m.m_len] = '\0';
}
if (start != NULL) {
*start = (gchar *)m.m_begin;
}
if (fin != NULL) {
*fin = (gchar *)m.m_begin + m.m_len;
}
if (fin) {
*fin = cb.fin;
}
else {
*url_str = NULL;
if (start != NULL) {
*start = (gchar *)pos;
}
if (fin != NULL) {
*fin = (gchar *)pos + strlen (m.prefix);
}
if (url_str) {
*url_str = cb.url_str;
}
return TRUE;
@ -1960,10 +1992,10 @@ rspamd_url_find (rspamd_mempool_t *pool,
struct rspamd_url *
rspamd_url_get_next (rspamd_mempool_t *pool,
const gchar *start, gchar const **pos)
const gchar *start, gchar const **pos, gint *statep)
{
const gchar *p, *end;
gchar *url_str = NULL, *url_start, *url_end;
const gchar *p, *end, *url_start, *url_end;
gchar *url_str = NULL;
struct rspamd_url *new;
gint rc;
@ -1978,7 +2010,7 @@ rspamd_url_get_next (rspamd_mempool_t *pool,
if (p < end) {
if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str,
FALSE)) {
FALSE, statep)) {
if (url_str != NULL) {
new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));

View File

@ -104,11 +104,11 @@ enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
gboolean rspamd_url_find (rspamd_mempool_t *pool,
const gchar *begin,
gsize len,
gchar **start,
gchar **end,
const gchar **start,
const gchar **end,
gchar **url_str,
gboolean is_html);
gboolean is_html,
gint *statep);
/*
* Return text representation of url parsing error
*/
@ -123,6 +123,6 @@ const gchar * rspamd_url_strerror (enum uri_errno err);
*/
struct rspamd_url *
rspamd_url_get_next (rspamd_mempool_t *pool,
const gchar *start, gchar const **pos);
const gchar *start, gchar const **pos, gint *statep);
#endif