summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2016-04-14 13:07:40 +0100
committerVsevolod Stakhov <vsevolod@highsecure.ru>2016-04-14 13:07:40 +0100
commitf2f1ea684b61abb0c810a0a1fb26c07b0e019d06 (patch)
tree64d6096ed40f12eebdbd9cd5e76a5fdf7ad3bda9
parentda58466e4e5f47ab916db936580ed67d75218c28 (diff)
downloadrspamd-f2f1ea684b61abb0c810a0a1fb26c07b0e019d06.tar.gz
rspamd-f2f1ea684b61abb0c810a0a1fb26c07b0e019d06.zip
[Fix] Use multipattern in url matcher
-rw-r--r--src/libserver/html.c9
-rw-r--r--src/libserver/url.c533
-rw-r--r--src/libserver/url.h18
3 files changed, 283 insertions, 277 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 49034acf6..793f1c5a0 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -683,13 +683,12 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
gboolean *url_found)
{
struct rspamd_url *text_url;
- gint rc, state = 0;
+ gint rc;
gchar *url_str = NULL;
*url_found = FALSE;
- if (rspamd_url_find (pool, url_text, len, NULL, NULL, &url_str,
- TRUE, &state) && url_str != NULL) {
+ if (rspamd_url_find (pool, url_text, len, &url_str, TRUE) && url_str != NULL) {
text_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool);
@@ -1235,15 +1234,13 @@ static void
rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
GHashTable *target)
{
- gint nstate = 0;
struct rspamd_url *query_url;
gchar *url_str;
gint rc;
if (url->querylen > 0) {
- if (rspamd_url_find (pool, url->query, url->querylen, NULL, NULL,
- &url_str, TRUE, &nstate)) {
+ if (rspamd_url_find (pool, url->query, url->querylen, &url_str, TRUE)) {
query_url = rspamd_mempool_alloc0 (pool,
sizeof (struct rspamd_url));
diff --git a/src/libserver/url.c b/src/libserver/url.c
index 700ffe34b..94b7964fb 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -46,7 +46,7 @@
#include "rspamd.h"
#include "message.h"
#include "http.h"
-#include "acism.h"
+#include "multipattern.h"
#include "http_parser.h"
typedef struct url_match_s {
@@ -65,7 +65,7 @@ typedef struct url_match_s {
struct url_callback_data;
struct url_matcher {
- gchar *pattern;
+ const gchar *pattern;
const gchar *prefix;
gboolean (*start) (struct url_callback_data *cb,
@@ -77,6 +77,7 @@ struct url_matcher {
url_match_t *match);
gint flags;
+ gsize patlen;
};
static gboolean url_file_start (struct url_callback_data *cb,
@@ -114,38 +115,38 @@ static gboolean url_email_end (struct url_callback_data *cb,
struct url_matcher static_matchers[] = {
/* Common prefixes */
{"file://", "", url_file_start, url_file_end,
- 0},
+ 0, 0},
{"ftp://", "", url_web_start, url_web_end,
- 0},
+ 0, 0},
{"sftp://", "", url_web_start, url_web_end,
- 0},
+ 0, 0},
{"http://", "", url_web_start, url_web_end,
- 0},
+ 0, 0},
{"https://", "", url_web_start, url_web_end,
- 0},
+ 0, 0},
{"news://", "", url_web_start, url_web_end,
- 0},
+ 0, 0},
{"nntp://", "", url_web_start, url_web_end,
- 0},
+ 0, 0},
{"telnet://", "", url_web_start, url_web_end,
- 0},
+ 0, 0},
{"webcal://", "", url_web_start, url_web_end,
- 0},
+ 0, 0},
{"mailto:", "", url_email_start, url_email_end,
- 0},
+ 0, 0},
{"callto://", "", url_web_start, url_web_end,
- 0},
+ 0, 0},
{"h323:", "", url_web_start, url_web_end,
- 0},
+ 0, 0},
{"sip:", "", url_web_start, url_web_end,
- 0},
+ 0, 0},
{"www.", "http://", url_web_start, url_web_end,
- 0},
+ 0, 0},
{"ftp.", "ftp://", url_web_start, url_web_end,
- URL_FLAG_NOHTML},
+ URL_FLAG_NOHTML, 0},
/* Likely emails */
{"@", "mailto://", url_email_start, url_email_end,
- URL_FLAG_NOHTML}
+ URL_FLAG_NOHTML, 0}
};
struct url_callback_data {
@@ -158,12 +159,13 @@ struct url_callback_data {
const gchar *fin;
const gchar *end;
const gchar *last_at;
+ url_insert_function func;
+ void *funcd;
};
struct url_match_scanner {
GArray *matchers;
- GArray *patterns;
- ac_trie_t *search_trie;
+ struct rspamd_multipattern *search_trie;
};
struct url_match_scanner *url_scanner = NULL;
@@ -339,9 +341,8 @@ rspamd_url_parse_tld_file (const gchar *fname,
{
FILE *f;
struct url_matcher m;
- ac_trie_pat_t pat;
gchar *linebuf = NULL, *p;
- gsize buflen = 0, patlen;
+ gsize buflen = 0;
gssize r;
gint flags;
@@ -372,6 +373,7 @@ rspamd_url_parse_tld_file (const gchar *fname,
flags = URL_FLAG_NOHTML | URL_FLAG_TLD_MATCH;
+#ifndef WITH_HYPERSCAN
if (linebuf[0] == '*') {
flags |= URL_FLAG_STAR_MATCH;
p = strchr (linebuf, '.');
@@ -385,16 +387,16 @@ rspamd_url_parse_tld_file (const gchar *fname,
else {
p = linebuf;
}
+#else
+ p = linebuf;
+#endif
- patlen = strlen (p);
- m.pattern = g_malloc (patlen + 2);
- m.pattern[0] = '.';
m.flags = flags;
- pat.ptr = m.pattern;
- pat.len = patlen + 1;
- rspamd_strlcpy (&m.pattern[1], p, patlen + 1);
+ rspamd_multipattern_add_pattern (url_scanner->search_trie, p);
+ m.pattern = rspamd_multipattern_get_pattern (url_scanner->search_trie,
+ rspamd_multipattern_get_npatterns (url_scanner->search_trie) - 1);
+ m.patlen = strlen (m.pattern);
g_array_append_val (url_scanner->matchers, m);
- g_array_append_val (url_scanner->patterns, pat);
}
free (linebuf);
@@ -405,26 +407,26 @@ static void
rspamd_url_add_static_matchers (struct url_match_scanner *sc)
{
gint n = G_N_ELEMENTS (static_matchers), i;
- ac_trie_pat_t pat;
g_array_append_vals (sc->matchers, static_matchers, n);
for (i = 0; i < n; i++) {
- pat.ptr = static_matchers[i].pattern;
- pat.len = strlen (pat.ptr);
- g_array_append_val (sc->patterns, pat);
+ rspamd_multipattern_add_pattern (url_scanner->search_trie,
+ static_matchers[i].pattern);
}
}
void
rspamd_url_init (const gchar *tld_file)
{
+ GError *err = NULL;
+
if (url_scanner == NULL) {
url_scanner = g_malloc (sizeof (struct url_match_scanner));
url_scanner->matchers = g_array_sized_new (FALSE, TRUE,
sizeof (struct url_matcher), 512);
- url_scanner->patterns = g_array_sized_new (FALSE, TRUE,
- sizeof (ac_trie_pat_t), 512);
+ url_scanner->search_trie = rspamd_multipattern_create_sized (512,
+ RSPAMD_MULTIPATTERN_TLD | RSPAMD_MULTIPATTERN_ICASE);
rspamd_url_add_static_matchers (url_scanner);
if (tld_file != NULL) {
@@ -435,12 +437,14 @@ rspamd_url_init (const gchar *tld_file)
"tld extension file is not specified, url matching is limited");
}
- url_scanner->search_trie = acism_create (
- (const ac_trie_pat_t *) url_scanner->patterns->data,
- url_scanner->patterns->len);
+ if (!rspamd_multipattern_compile (url_scanner->search_trie, &err)) {
+ msg_err ("cannot compile tld patterns, url matching will be "
+ "broken completely: %e", err);
+ g_error_free (err);
+ }
- msg_info ("initialized ac_trie of %ud elements",
- url_scanner->patterns->len);
+ msg_info ("initialized trie of %ud elements",
+ url_scanner->matchers->len);
}
}
@@ -1078,31 +1082,35 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
#undef SET_U
static gint
-rspamd_tld_trie_callback (int strnum, int textpos, void *context)
+rspamd_tld_trie_callback (struct rspamd_multipattern *mp,
+ guint strnum,
+ gint match_start,
+ gint match_pos,
+ const gchar *text,
+ gsize len,
+ void *context)
{
struct url_matcher *matcher;
const gchar *start, *pos, *p;
struct rspamd_url *url = context;
- ac_trie_pat_t *pat;
gint ndots = 1;
matcher = &g_array_index (url_scanner->matchers, struct url_matcher,
strnum);
- pat = &g_array_index (url_scanner->patterns, ac_trie_pat_t, strnum);
if (matcher->flags & URL_FLAG_STAR_MATCH) {
/* Skip one more tld component */
ndots = 2;
}
- pos = url->host + textpos - pat->len;
+ pos = text + match_start;
p = pos - 1;
start = url->host;
- if (*pos != '.' || textpos != (gint) url->hostlen) {
+ if (*pos != '.' || match_pos != (gint) url->hostlen) {
/* Something weird has been found */
- if (textpos == (gint) url->hostlen - 1) {
- pos = url->host + textpos;
+ if (match_pos == (gint) url->hostlen - 1) {
+ pos = url->host + match_pos;
if (*pos == '.') {
/* This is dot at the end of domain */
url->hostlen--;
@@ -1401,7 +1409,6 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
const gchar *end;
guint i, complen, ret;
gsize unquoted_len = 0;
- gint state = 0;
const struct {
enum rspamd_url_protocol proto;
@@ -1557,8 +1564,9 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
}
/* Find TLD part */
- if (acism_lookup (url_scanner->search_trie, uri->host, uri->hostlen,
- rspamd_tld_trie_callback, uri, &state, true) == 0) {
+ if (rspamd_multipattern_lookup (url_scanner->search_trie,
+ uri->host, uri->hostlen,
+ rspamd_tld_trie_callback, uri, NULL) == 0) {
/* Ignore URL's without TLD if it is not a numeric URL */
if (!rspamd_url_is_ip (uri, pool)) {
return URI_ERRNO_TLD_MISSING;
@@ -1579,30 +1587,34 @@ struct tld_trie_cbdata {
};
static gint
-rspamd_tld_trie_find_callback (int strnum, int textpos, void *context)
+rspamd_tld_trie_find_callback (struct rspamd_multipattern *mp,
+ guint strnum,
+ gint match_start,
+ gint match_pos,
+ const gchar *text,
+ gsize len,
+ void *context)
{
struct url_matcher *matcher;
const gchar *start, *pos, *p;
struct tld_trie_cbdata *cbdata = context;
- ac_trie_pat_t *pat;
gint ndots = 1;
matcher = &g_array_index (url_scanner->matchers, struct url_matcher,
strnum);
- pat = &g_array_index (url_scanner->patterns, ac_trie_pat_t, strnum);
if (matcher->flags & URL_FLAG_STAR_MATCH) {
/* Skip one more tld component */
ndots = 2;
}
- pos = cbdata->begin + textpos - pat->len;
+ pos = text + match_start;
p = pos - 1;
- start = cbdata->begin;
+ start = text;
- if (*pos != '.' || textpos != (gint)cbdata->len) {
+ if (*pos != '.' || match_pos != (gint)cbdata->len) {
/* Something weird has been found */
- if (textpos != (gint)cbdata->len - 1) {
+ if (match_pos != (gint)cbdata->len - 1) {
/* Search more */
return 0;
}
@@ -1632,7 +1644,6 @@ gboolean
rspamd_url_find_tld (const gchar *in, gsize inlen, rspamd_ftok_t *out)
{
struct tld_trie_cbdata cbdata;
- gint state = 0;
g_assert (in != NULL);
g_assert (out != NULL);
@@ -1642,8 +1653,8 @@ rspamd_url_find_tld (const gchar *in, gsize inlen, rspamd_ftok_t *out)
cbdata.len = inlen;
cbdata.out = out;
- if (acism_lookup (url_scanner->search_trie, in, inlen,
- rspamd_tld_trie_find_callback, &cbdata, &state, true) == 0) {
+ if (rspamd_multipattern_lookup (url_scanner->search_trie, in, inlen,
+ rspamd_tld_trie_find_callback, &cbdata, NULL) == 0) {
return FALSE;
}
@@ -1966,160 +1977,67 @@ url_email_end (struct url_callback_data *cb,
return FALSE;
}
-void
-rspamd_url_text_extract (rspamd_mempool_t *pool,
- struct rspamd_task *task,
- struct mime_text_part *part,
- gboolean is_html)
+static gboolean
+rspamd_url_trie_is_match (struct url_matcher *matcher, const gchar *pos,
+ const gchar *end)
{
- gint rc, state = 0;
- gchar *url_str = NULL;
- struct rspamd_url *url;
- struct process_exception *ex;
- const gchar *p, *end, *begin, *url_start, *url_end;
-
- if (part->content == NULL || part->content->len == 0) {
- msg_warn_task ("got empty text part");
- return;
- }
-
- begin = part->content->data;
- end = begin + part->content->len;
- p = begin;
- while (p < end) {
- if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str,
- is_html, &state)) {
- if (url_str != NULL) {
- url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
- ex =
- rspamd_mempool_alloc0 (pool,
- sizeof (struct process_exception));
- if (url != NULL) {
- g_strstrip (url_str);
- rc = rspamd_url_parse (url, url_str, strlen (url_str),
- pool);
- if (rc == URI_ERRNO_OK &&
- url->hostlen > 0) {
- ex->pos = url_start - begin;
- ex->len = url_end - url_start;
- if (url->protocol == PROTOCOL_MAILTO) {
- if (url->userlen > 0) {
- if (!g_hash_table_lookup (task->emails, url)) {
- g_hash_table_insert (task->emails, url,
- url);
- }
- }
- }
- else {
- if (!g_hash_table_lookup (task->urls, url)) {
- g_hash_table_insert (task->urls, url, url);
- }
- }
- part->urls_offset = g_list_prepend (
- part->urls_offset,
- ex);
-
- /* We also search the query for additional url inside */
- if (url->querylen > 0) {
- gint nstate = 0;
- struct rspamd_url *query_url;
-
- if (rspamd_url_find (pool,
- url->query,
- url->querylen,
- NULL,
- NULL,
- &url_str,
- is_html,
- &nstate)) {
-
- query_url = rspamd_mempool_alloc0 (pool,
- sizeof (struct rspamd_url));
- rc = rspamd_url_parse (query_url,
- url_str,
- strlen (url_str),
- pool);
- if (rc == URI_ERRNO_OK &&
- url->hostlen > 0) {
- msg_debug_task ("found url %s in query of url"
- " %*s", url_str, url->querylen, url->query);
-
- if (!g_hash_table_lookup (task->urls,
- query_url)) {
- g_hash_table_insert (task->urls,
- query_url,
- query_url);
- }
- }
- }
+ if (matcher->flags & URL_FLAG_TLD_MATCH) {
+ /* Immediately check pos for valid chars */
+ if (pos < end) {
+ if (!g_ascii_isspace (*pos) && *pos != '/' && *pos != '?' &&
+ *pos != ':' && !is_url_end (*pos)) {
+ if (*pos == '.') {
+ /* We allow . at the end of the domain however */
+ pos++;
+ if (pos < end) {
+ if (!g_ascii_isspace (*pos) && *pos != '/' &&
+ *pos != '?' && *pos != ':' && !is_url_end (*pos)) {
+ return FALSE;
}
}
- else if (rc != URI_ERRNO_OK) {
- msg_info_task ("extract of url '%s' failed: %s",
- url_str,
- rspamd_url_strerror (rc));
- }
+ }
+ else {
+ return FALSE;
}
}
}
- else {
- break;
- }
- p = url_end + 1;
- }
- /* Handle offsets of this part */
- if (part->urls_offset != NULL) {
- part->urls_offset = g_list_reverse (part->urls_offset);
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) g_list_free, part->urls_offset);
}
+
+ return TRUE;
}
static gint
-rspamd_url_trie_callback (int strnum, int textpos, void *context)
+rspamd_url_trie_callback (struct rspamd_multipattern *mp,
+ guint strnum,
+ gint match_start,
+ gint match_pos,
+ const gchar *text,
+ gsize len,
+ void *context)
{
struct url_matcher *matcher;
url_match_t m;
const gchar *pos;
struct url_callback_data *cb = context;
- ac_trie_pat_t *pat;
matcher = &g_array_index (url_scanner->matchers, struct url_matcher,
strnum);
+
if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) {
/* Do not try to match non-html like urls in html texts */
return 0;
}
- if (matcher->flags & URL_FLAG_TLD_MATCH) {
- /* Immediately check pos for valid chars */
- pos = &cb->begin[textpos];
- if (pos < cb->end) {
- if (!g_ascii_isspace (*pos) && *pos != '/' && *pos != '?' &&
- *pos != ':' && !is_url_end (*pos)) {
- if (*pos == '.') {
- /* We allow . at the end of the domain however */
- pos++;
- if (pos < cb->end) {
- if (!g_ascii_isspace (*pos) && *pos != '/' &&
- *pos != '?' && *pos != ':' && !is_url_end (*pos)) {
- return 0;
- }
- }
- }
- else {
- return 0;
- }
- }
- }
- }
+ pos = &cb->begin[match_pos];
- pat = &g_array_index (url_scanner->patterns, ac_trie_pat_t, strnum);
+ if (!rspamd_url_trie_is_match (matcher, pos, cb->end)) {
+ return 0;
+ }
m.pattern = matcher->pattern;
m.prefix = matcher->prefix;
m.add_prefix = FALSE;
- pos = cb->begin + textpos - pat->len;
+ pos = cb->begin + match_start;
if (matcher->start (cb, pos, &m) &&
matcher->end (cb, pos, &m)) {
@@ -2155,14 +2073,11 @@ gboolean
rspamd_url_find (rspamd_mempool_t *pool,
const gchar *begin,
gsize len,
- const gchar **start,
- const gchar **fin,
gchar **url_str,
- gboolean is_html,
- gint *statep)
+ gboolean is_html)
{
struct url_callback_data cb;
- gint ret, state;
+ gint ret;
memset (&cb, 0, sizeof (cb));
cb.begin = begin;
@@ -2170,27 +2085,10 @@ rspamd_url_find (rspamd_mempool_t *pool,
cb.is_html = is_html;
cb.pool = pool;
- if (statep != NULL) {
- state = *statep;
- }
- else {
- state = 0;
- }
-
- ret = acism_lookup (url_scanner->search_trie, begin, len,
- rspamd_url_trie_callback, &cb, &state, true);
-
- if (statep) {
- *statep = state;
- }
+ ret = rspamd_multipattern_lookup (url_scanner->search_trie, begin, len,
+ rspamd_url_trie_callback, &cb, NULL);
if (ret) {
- if (start) {
- *start = cb.start;
- }
- if (fin) {
- *fin = cb.fin;
- }
if (url_str) {
*url_str = cb.url_str;
}
@@ -2201,67 +2099,188 @@ rspamd_url_find (rspamd_mempool_t *pool,
return FALSE;
}
-struct rspamd_url *
-rspamd_url_get_next (rspamd_mempool_t *pool,
- const gchar *start, gchar const **pos, gint *statep)
+static gint
+rspamd_url_trie_generic_callback (struct rspamd_multipattern *mp,
+ guint strnum,
+ gint match_start,
+ gint match_pos,
+ const gchar *text,
+ gsize len,
+ void *context)
{
- const gchar *p, *end, *url_start, *url_end;
- gchar *url_str = NULL;
- struct rspamd_url *new;
+ struct rspamd_url *url;
+ struct url_matcher *matcher;
+ url_match_t m;
+ const gchar *pos;
+ struct url_callback_data *cb = context;
gint rc;
+ rspamd_mempool_t *pool;
+
+ matcher = &g_array_index (url_scanner->matchers, struct url_matcher,
+ strnum);
+ pool = cb->pool;
- end = start + strlen (start);
+ if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) {
+ /* Do not try to match non-html like urls in html texts */
+ return 0;
+ }
+
+ pos = text + match_pos;
+
+ if (!rspamd_url_trie_is_match (matcher, pos, text + len)) {
+ return 0;
+ }
+
+ m.pattern = matcher->pattern;
+ m.prefix = matcher->prefix;
+ m.add_prefix = FALSE;
+
+ if (matcher->start (cb, pos, &m) &&
+ matcher->end (cb, pos, &m)) {
+ if (m.add_prefix || matcher->prefix[0] != '\0') {
+ cb->len = m.m_len + strlen (matcher->prefix);
+ cb->url_str = rspamd_mempool_alloc (cb->pool, cb->len + 1);
+ cb->len = rspamd_snprintf (cb->url_str,
+ cb->len + 1,
+ "%s%*s",
+ m.prefix,
+ (gint)m.m_len,
+ m.m_begin);
+ }
+ else {
+ cb->url_str = rspamd_mempool_alloc (cb->pool, m.m_len + 1);
+ rspamd_strlcpy (cb->url_str, m.m_begin, m.m_len + 1);
+ }
- if (pos == NULL || *pos == NULL) {
- p = start;
+ cb->start = m.m_begin;
+ cb->fin = m.m_begin + m.m_len;
+ url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
+ g_strstrip (cb->url_str);
+ rc = rspamd_url_parse (url, cb->url_str, m.m_len + 1, pool);
+
+ if (rc == URI_ERRNO_OK && url->hostlen > 0) {
+ if (cb->func) {
+ cb->func (url, cb->start - text, cb->fin - text, cb->funcd);
+ }
+ }
+ else if (rc != URI_ERRNO_OK) {
+ msg_info_pool_check ("extract of url '%s' failed: %s",
+ cb->url_str,
+ rspamd_url_strerror (rc));
+ }
}
else {
- p = *pos;
+ cb->url_str = NULL;
}
- if (p < end) {
- if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str,
- FALSE, statep)) {
- if (url_str != NULL) {
- new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
+ /* Continue search */
+ return 0;
+}
- if (new != NULL) {
- g_strstrip (url_str);
- rc = rspamd_url_parse (new, url_str, strlen (url_str),
- pool);
- if (rc == URI_ERRNO_OK &&
- new->hostlen > 0) {
+struct rspamd_url_mimepart_cbdata {
+ struct rspamd_task *task;
+ struct mime_text_part *part;
+};
- if (new->protocol == PROTOCOL_MAILTO) {
- if (new->userlen > 0) {
- return new;
- }
- }
- else {
- return new;
- }
- }
- else if (rc != URI_ERRNO_OK) {
- rspamd_default_log_function (G_LOG_LEVEL_INFO,
- pool->tag.tagname, pool->tag.uid,
- G_STRFUNC,
- "extract of url '%s' failed: %s",
- url_str,
- rspamd_url_strerror (rc));
- }
+static void
+rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
+ gsize end_offset, gpointer ud)
+{
+ struct rspamd_url_mimepart_cbdata *cbd = ud;
+ struct process_exception *ex;
+ struct rspamd_task *task;
+ gchar *url_str = NULL;
+ struct rspamd_url *query_url;
+ gint rc;
+
+ task = cbd->task;
+ ex = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct process_exception));
+
+ ex->pos = start_offset;
+ ex->len = end_offset - start_offset;
+
+ if (url->protocol == PROTOCOL_MAILTO) {
+ if (url->userlen > 0) {
+ if (!g_hash_table_lookup (task->emails, url)) {
+ g_hash_table_insert (task->emails, url,
+ url);
+ }
+ }
+ }
+ else {
+ if (!g_hash_table_lookup (task->urls, url)) {
+ g_hash_table_insert (task->urls, url, url);
+ }
+ }
+
+ cbd->part->urls_offset = g_list_prepend (
+ cbd->part->urls_offset,
+ ex);
+
+ /* We also search the query for additional url inside */
+ if (url->querylen > 0) {
+ if (rspamd_url_find (task->task_pool,
+ url->query,
+ url->querylen,
+ &url_str,
+ IS_PART_HTML (cbd->part))) {
+
+ query_url = rspamd_mempool_alloc0 (task->task_pool,
+ sizeof (struct rspamd_url));
+ rc = rspamd_url_parse (query_url,
+ url_str,
+ strlen (url_str),
+ task->task_pool);
+
+ if (rc == URI_ERRNO_OK &&
+ url->hostlen > 0) {
+ msg_debug_task ("found url %s in query of url"
+ " %*s", url_str, url->querylen, url->query);
+
+ if (!g_hash_table_lookup (task->urls,
+ query_url)) {
+ g_hash_table_insert (task->urls,
+ query_url,
+ query_url);
}
}
}
- p = url_end + 1;
+ }
+}
- if (pos != NULL) {
- *pos = p;
- }
+void
+rspamd_url_text_extract (rspamd_mempool_t *pool,
+ struct rspamd_task *task,
+ struct mime_text_part *part,
+ gboolean is_html)
+{
+ struct url_callback_data cb;
+ struct rspamd_url_mimepart_cbdata mcbd;
+
+ if (part->content == NULL || part->content->len == 0) {
+ msg_warn_task ("got empty text part");
+ return;
}
- return NULL;
-}
+ memset (&cb, 0, sizeof (cb));
+ cb.begin = part->content->data;
+ cb.end = part->content->data + part->content->len;
+ cb.is_html = is_html;
+ cb.pool = pool;
-/*
- * vi: ts=4
- */
+ mcbd.task = task;
+ mcbd.part = part;
+ cb.funcd = &mcbd;
+ cb.func = rspamd_url_text_part_callback;
+
+ rspamd_multipattern_lookup (url_scanner->search_trie, cb.begin,
+ part->content->len,
+ rspamd_url_trie_generic_callback, &cb, NULL);
+
+ /* Handle offsets of this part */
+ if (part->urls_offset != NULL) {
+ part->urls_offset = g_list_reverse (part->urls_offset);
+ rspamd_mempool_add_destructor (task->task_pool,
+ (rspamd_mempool_destruct_t) g_list_free, part->urls_offset);
+ }
+}
diff --git a/src/libserver/url.h b/src/libserver/url.h
index 7dfcb05af..3af11d638 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -106,26 +106,13 @@ enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
gboolean rspamd_url_find (rspamd_mempool_t *pool,
const gchar *begin,
gsize len,
- const gchar **start,
- const gchar **end,
gchar **url_str,
- gboolean is_html,
- gint *statep);
+ gboolean is_html);
/*
* Return text representation of url parsing error
*/
const gchar * rspamd_url_strerror (enum uri_errno err);
-/**
- * Convenience routine to extract urls from an arbitrarty text
- * @param pool
- * @param start
- * @param pos
- * @return url or NULL
- */
-struct rspamd_url *
-rspamd_url_get_next (rspamd_mempool_t *pool,
- const gchar *start, gchar const **pos, gint *statep);
/**
* Find TLD for a specified host string
@@ -136,4 +123,7 @@ rspamd_url_get_next (rspamd_mempool_t *pool,
*/
gboolean rspamd_url_find_tld (const gchar *in, gsize inlen, rspamd_ftok_t *out);
+typedef void (*url_insert_function) (struct rspamd_url *url,
+ gsize start_offset, gsize end_offset, void *ud);
+
#endif