mirror of
https://github.com/rspamd/rspamd.git
synced 2024-07-29 08:17:22 +02:00
Use new ac_trie for url extraction.
This commit is contained in:
parent
680a2b66cb
commit
d6724d926d
@ -24,11 +24,11 @@
|
||||
|
||||
int
|
||||
acism_lookup(ac_trie_t const *psp, const char *text, size_t len,
|
||||
ACISM_ACTION *cb, void *context)
|
||||
ACISM_ACTION *cb, void *context, int *statep)
|
||||
{
|
||||
ac_trie_t const ps = *psp;
|
||||
char const *cp = text, *endp = cp + len;
|
||||
STATE state = 0;
|
||||
STATE state = *statep;
|
||||
int ret = 0;
|
||||
|
||||
while (cp < endp) {
|
||||
@ -102,6 +102,18 @@ acism_lookup(ac_trie_t const *psp, const char *text, size_t len,
|
||||
}
|
||||
}
|
||||
EXIT:
|
||||
*statep = state;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void
|
||||
acism_destroy(ac_trie_t *psp)
|
||||
{
|
||||
if (!psp) return;
|
||||
if (psp->flags & IS_MMAP)
|
||||
munmap((char*)psp->tranv - sizeof(ac_trie_t),
|
||||
sizeof(ac_trie_t) + p_size(psp));
|
||||
else free(psp->tranv);
|
||||
free(psp);
|
||||
}
|
||||
//EOF
|
||||
|
@ -46,6 +46,6 @@ typedef int (ACISM_ACTION)(int strnum, int textpos, void *context);
|
||||
// *statep should initially be (0).
|
||||
|
||||
int acism_lookup(ac_trie_t const *psp, const char *text, size_t len,
|
||||
ACISM_ACTION *cb, void *context);
|
||||
ACISM_ACTION *cb, void *context, int *statep);
|
||||
|
||||
#endif//ACISM_H
|
||||
|
@ -1517,10 +1517,11 @@ process_message (struct rspamd_task *task)
|
||||
GMimePart *part;
|
||||
GMimeDataWrapper *wrapper;
|
||||
struct received_header *recv;
|
||||
gchar *mid, *url_str, *p, *end, *url_end;
|
||||
gchar *mid, *url_str;
|
||||
const gchar *url_end, *p, *end;
|
||||
struct rspamd_url *subject_url;
|
||||
gsize len;
|
||||
gint rc;
|
||||
gint rc, state = 0;
|
||||
|
||||
tmp = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray));
|
||||
tmp->data = (guint8 *)task->msg.start;
|
||||
@ -1708,7 +1709,7 @@ process_message (struct rspamd_task *task)
|
||||
while (p < end) {
|
||||
/* Search to the end of url */
|
||||
if (rspamd_url_find (task->task_pool, p, end - p, NULL, &url_end,
|
||||
&url_str, FALSE)) {
|
||||
&url_str, FALSE, &state)) {
|
||||
if (url_str != NULL) {
|
||||
subject_url = rspamd_mempool_alloc0 (task->task_pool,
|
||||
sizeof (struct rspamd_url));
|
||||
|
@ -682,7 +682,7 @@ check_phishing (struct rspamd_task *task,
|
||||
gchar tagbuf[128];
|
||||
struct html_tag *tag;
|
||||
gsize len = 0;
|
||||
gint rc;
|
||||
gint rc, state = 0;
|
||||
|
||||
p = url_text;
|
||||
while (len < remain) {
|
||||
@ -730,7 +730,7 @@ check_phishing (struct rspamd_task *task,
|
||||
}
|
||||
|
||||
if (rspamd_url_find (task->task_pool, url_text, len, NULL, NULL, &url_str,
|
||||
TRUE) && url_str != NULL) {
|
||||
TRUE, &state) && url_str != NULL) {
|
||||
new = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_url));
|
||||
g_strstrip (url_str);
|
||||
rc = rspamd_url_parse (new, url_str, strlen (url_str), task->task_pool);
|
||||
|
@ -32,6 +32,7 @@
|
||||
#include "message.h"
|
||||
#include "trie.h"
|
||||
#include "http.h"
|
||||
#include "acism.h"
|
||||
|
||||
typedef struct url_match_s {
|
||||
const gchar *m_begin;
|
||||
@ -673,8 +674,8 @@ struct url_matcher static_matchers[] = {
|
||||
|
||||
struct url_match_scanner {
|
||||
GArray *matchers;
|
||||
rspamd_trie_t *search_trie;
|
||||
rspamd_trie_t *tld_trie;
|
||||
GArray *patterns;
|
||||
ac_trie_t *search_trie;
|
||||
};
|
||||
|
||||
struct url_match_scanner *url_scanner = NULL;
|
||||
@ -827,6 +828,7 @@ rspamd_url_parse_tld_file (const gchar *fname, struct url_match_scanner *scanner
|
||||
{
|
||||
FILE *f;
|
||||
struct url_matcher m;
|
||||
ac_trie_pat_t pat;
|
||||
gchar *linebuf = NULL, *p;
|
||||
gsize buflen = 0, patlen;
|
||||
gssize r;
|
||||
@ -876,8 +878,11 @@ rspamd_url_parse_tld_file (const gchar *fname, struct url_match_scanner *scanner
|
||||
patlen = strlen (p);
|
||||
m.pattern = g_malloc (patlen + 2);
|
||||
m.pattern[0] = '.';
|
||||
pat.ptr = m.pattern;
|
||||
pat.len = patlen + 1;
|
||||
rspamd_strlcpy (&m.pattern[1], p, patlen + 1);
|
||||
g_array_append_val (url_scanner->matchers, m);
|
||||
g_array_append_val (url_scanner->patterns, pat);
|
||||
}
|
||||
|
||||
free (linebuf);
|
||||
@ -885,27 +890,30 @@ rspamd_url_parse_tld_file (const gchar *fname, struct url_match_scanner *scanner
|
||||
}
|
||||
|
||||
static void
|
||||
rspamd_url_add_static_matchers (GArray *matchers)
|
||||
rspamd_url_add_static_matchers (struct url_match_scanner *sc)
|
||||
{
|
||||
gint n = G_N_ELEMENTS (static_matchers);
|
||||
gint n = G_N_ELEMENTS (static_matchers), i;
|
||||
ac_trie_pat_t pat;
|
||||
|
||||
g_array_append_vals (matchers, static_matchers, n);
|
||||
g_array_append_vals (sc->matchers, static_matchers, n);
|
||||
|
||||
for (i = 0; i < n; i ++) {
|
||||
pat.ptr = static_matchers[i].pattern;
|
||||
pat.len = strlen (pat.ptr);
|
||||
g_array_append_val (sc->patterns, pat);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
rspamd_url_init (const gchar *tld_file)
|
||||
{
|
||||
guint i;
|
||||
gchar patbuf[128];
|
||||
struct url_matcher *m;
|
||||
|
||||
if (url_scanner == NULL) {
|
||||
url_scanner = g_malloc (sizeof (struct url_match_scanner));
|
||||
url_scanner->matchers = g_array_new (FALSE, TRUE,
|
||||
sizeof (struct url_matcher));
|
||||
url_scanner->search_trie = rspamd_trie_create (TRUE);
|
||||
url_scanner->tld_trie = rspamd_trie_create (TRUE);
|
||||
rspamd_url_add_static_matchers (url_scanner->matchers);
|
||||
url_scanner->matchers = g_array_sized_new (FALSE, TRUE,
|
||||
sizeof (struct url_matcher), 512);
|
||||
url_scanner->patterns = g_array_sized_new (FALSE, TRUE,
|
||||
sizeof (ac_trie_pat_t), 512);
|
||||
rspamd_url_add_static_matchers (url_scanner);
|
||||
|
||||
if (tld_file != NULL) {
|
||||
rspamd_url_parse_tld_file (tld_file, url_scanner);
|
||||
@ -914,16 +922,11 @@ rspamd_url_init (const gchar *tld_file)
|
||||
msg_warn ("tld extension file is not specified, url matching is limited");
|
||||
}
|
||||
|
||||
for (i = 0; i < url_scanner->matchers->len; i++) {
|
||||
m = &g_array_index (url_scanner->matchers, struct url_matcher, i);
|
||||
url_scanner->search_trie = acism_create (
|
||||
(const ac_trie_pat_t *)url_scanner->patterns->data,
|
||||
url_scanner->patterns->len);
|
||||
|
||||
rspamd_trie_insert (url_scanner->search_trie, m->pattern, i);
|
||||
|
||||
/* Also use it for TLD lookups */
|
||||
if (strcmp (m->prefix, "http://") == 0) {
|
||||
rspamd_trie_insert (url_scanner->tld_trie, m->pattern, i);
|
||||
}
|
||||
}
|
||||
msg_info ("initialized ac_trie of %ud elements", url_scanner->patterns->len);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1822,12 +1825,11 @@ rspamd_url_text_extract (rspamd_mempool_t * pool,
|
||||
struct mime_text_part *part,
|
||||
gboolean is_html)
|
||||
{
|
||||
gint rc;
|
||||
gchar *url_str = NULL, *url_start, *url_end;
|
||||
gint rc, state = 0;
|
||||
gchar *url_str = NULL;
|
||||
struct rspamd_url *new;
|
||||
struct process_exception *ex;
|
||||
gchar *p, *end, *begin;
|
||||
|
||||
const gchar *p, *end, *begin, *url_start, *url_end;
|
||||
|
||||
if (part->content == NULL || part->content->len == 0) {
|
||||
msg_warn ("got empty text part");
|
||||
@ -1839,7 +1841,7 @@ rspamd_url_text_extract (rspamd_mempool_t * pool,
|
||||
p = begin;
|
||||
while (p < end) {
|
||||
if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str,
|
||||
is_html)) {
|
||||
is_html, &state)) {
|
||||
if (url_str != NULL) {
|
||||
new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
|
||||
ex =
|
||||
@ -1889,67 +1891,97 @@ rspamd_url_text_extract (rspamd_mempool_t * pool,
|
||||
}
|
||||
}
|
||||
|
||||
struct url_callback_data {
|
||||
const gchar *begin;
|
||||
gchar *url_str;
|
||||
rspamd_mempool_t *pool;
|
||||
gint len;
|
||||
gboolean is_html;
|
||||
const gchar *start;
|
||||
const gchar *fin;
|
||||
const gchar *end;
|
||||
};
|
||||
|
||||
static gint
|
||||
rspamd_url_trie_callback (int strnum, int textpos, void *context)
|
||||
{
|
||||
struct url_matcher *matcher;
|
||||
url_match_t m;
|
||||
const gchar *pos;
|
||||
struct url_callback_data *cb = context;
|
||||
|
||||
matcher = &g_array_index (url_scanner->matchers, struct url_matcher, strnum);
|
||||
if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) {
|
||||
/* Do not try to match non-html like urls in html texts */
|
||||
return 0;
|
||||
}
|
||||
|
||||
m.pattern = matcher->pattern;
|
||||
m.prefix = matcher->prefix;
|
||||
m.add_prefix = FALSE;
|
||||
pos = cb->begin + textpos;
|
||||
|
||||
if (matcher->start (cb->begin, cb->end, pos,
|
||||
&m) && matcher->end (cb->begin, cb->end, pos, &m)) {
|
||||
if (m.add_prefix || matcher->prefix[0] != '\0') {
|
||||
cb->len = m.m_len + strlen (m.prefix);
|
||||
cb->url_str = rspamd_mempool_alloc (cb->pool, cb->len + 1);
|
||||
rspamd_snprintf (cb->url_str,
|
||||
cb->len,
|
||||
"%s%*s",
|
||||
m.prefix,
|
||||
m.m_len,
|
||||
m.m_begin);
|
||||
}
|
||||
else {
|
||||
cb->url_str = rspamd_mempool_alloc (cb->pool, m.m_len + 1);
|
||||
rspamd_strlcpy (cb->url_str, m.m_begin, m.m_len + 1);
|
||||
}
|
||||
|
||||
cb->start = (gchar *)m.m_begin;
|
||||
cb->fin = (gchar *)m.m_begin + m.m_len;
|
||||
|
||||
return 1;
|
||||
}
|
||||
else {
|
||||
cb->url_str = NULL;
|
||||
}
|
||||
|
||||
/* Continue search */
|
||||
return 0;
|
||||
}
|
||||
|
||||
gboolean
|
||||
rspamd_url_find (rspamd_mempool_t *pool,
|
||||
const gchar *begin,
|
||||
gsize len,
|
||||
gchar **start,
|
||||
gchar **fin,
|
||||
const gchar **start,
|
||||
const gchar **fin,
|
||||
gchar **url_str,
|
||||
gboolean is_html)
|
||||
gboolean is_html,
|
||||
gint *statep)
|
||||
{
|
||||
const gchar *end, *pos;
|
||||
gint idx, l;
|
||||
struct url_matcher *matcher;
|
||||
url_match_t m;
|
||||
struct url_callback_data cb;
|
||||
gint ret;
|
||||
|
||||
end = begin + len;
|
||||
if ((pos =
|
||||
rspamd_trie_lookup (url_scanner->search_trie, begin, len,
|
||||
&idx)) == NULL) {
|
||||
return FALSE;
|
||||
}
|
||||
else {
|
||||
matcher = &g_array_index (url_scanner->matchers, struct url_matcher, idx);
|
||||
if ((matcher->flags & URL_FLAG_NOHTML) && is_html) {
|
||||
/* Do not try to match non-html like urls in html texts */
|
||||
return FALSE;
|
||||
g_assert (statep != NULL);
|
||||
memset (&cb, 0, sizeof (cb));
|
||||
cb.begin = begin;
|
||||
cb.end = begin + len;
|
||||
cb.is_html = is_html;
|
||||
cb.pool = pool;
|
||||
ret = acism_lookup (url_scanner->search_trie, begin, len,
|
||||
rspamd_url_trie_callback, &cb, statep);
|
||||
|
||||
if (ret) {
|
||||
if (start) {
|
||||
*start = cb.start;
|
||||
}
|
||||
m.pattern = matcher->pattern;
|
||||
m.prefix = matcher->prefix;
|
||||
m.add_prefix = FALSE;
|
||||
if (matcher->start (begin, end, pos,
|
||||
&m) && matcher->end (begin, end, pos, &m)) {
|
||||
if (m.add_prefix || matcher->prefix[0] != '\0') {
|
||||
l = m.m_len + 1 + strlen (m.prefix);
|
||||
*url_str = rspamd_mempool_alloc (pool, l);
|
||||
rspamd_snprintf (*url_str,
|
||||
l,
|
||||
"%s%*s",
|
||||
m.prefix,
|
||||
m.m_len,
|
||||
m.m_begin);
|
||||
}
|
||||
else {
|
||||
*url_str = rspamd_mempool_alloc (pool, m.m_len + 1);
|
||||
memcpy (*url_str, m.m_begin, m.m_len);
|
||||
(*url_str)[m.m_len] = '\0';
|
||||
}
|
||||
if (start != NULL) {
|
||||
*start = (gchar *)m.m_begin;
|
||||
}
|
||||
if (fin != NULL) {
|
||||
*fin = (gchar *)m.m_begin + m.m_len;
|
||||
}
|
||||
if (fin) {
|
||||
*fin = cb.fin;
|
||||
}
|
||||
else {
|
||||
*url_str = NULL;
|
||||
if (start != NULL) {
|
||||
*start = (gchar *)pos;
|
||||
}
|
||||
if (fin != NULL) {
|
||||
*fin = (gchar *)pos + strlen (m.prefix);
|
||||
}
|
||||
if (url_str) {
|
||||
*url_str = cb.url_str;
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
@ -1960,10 +1992,10 @@ rspamd_url_find (rspamd_mempool_t *pool,
|
||||
|
||||
struct rspamd_url *
|
||||
rspamd_url_get_next (rspamd_mempool_t *pool,
|
||||
const gchar *start, gchar const **pos)
|
||||
const gchar *start, gchar const **pos, gint *statep)
|
||||
{
|
||||
const gchar *p, *end;
|
||||
gchar *url_str = NULL, *url_start, *url_end;
|
||||
const gchar *p, *end, *url_start, *url_end;
|
||||
gchar *url_str = NULL;
|
||||
struct rspamd_url *new;
|
||||
gint rc;
|
||||
|
||||
@ -1978,7 +2010,7 @@ rspamd_url_get_next (rspamd_mempool_t *pool,
|
||||
|
||||
if (p < end) {
|
||||
if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str,
|
||||
FALSE)) {
|
||||
FALSE, statep)) {
|
||||
if (url_str != NULL) {
|
||||
new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
|
||||
|
||||
|
@ -104,11 +104,11 @@ enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
|
||||
gboolean rspamd_url_find (rspamd_mempool_t *pool,
|
||||
const gchar *begin,
|
||||
gsize len,
|
||||
gchar **start,
|
||||
gchar **end,
|
||||
const gchar **start,
|
||||
const gchar **end,
|
||||
gchar **url_str,
|
||||
gboolean is_html);
|
||||
|
||||
gboolean is_html,
|
||||
gint *statep);
|
||||
/*
|
||||
* Return text representation of url parsing error
|
||||
*/
|
||||
@ -123,6 +123,6 @@ const gchar * rspamd_url_strerror (enum uri_errno err);
|
||||
*/
|
||||
struct rspamd_url *
|
||||
rspamd_url_get_next (rspamd_mempool_t *pool,
|
||||
const gchar *start, gchar const **pos);
|
||||
const gchar *start, gchar const **pos, gint *statep);
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user