@@ -912,7 +912,12 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task, | |||
rspamd_normalize_text_part (task, text_part); | |||
if (!IS_PART_HTML (text_part)) { | |||
rspamd_url_text_extract (task->task_pool, task, text_part, FALSE); | |||
rspamd_url_text_extract (task->task_pool, task, text_part, | |||
RSPAMD_URL_FIND_ALL); | |||
} | |||
else { | |||
rspamd_url_text_extract (task->task_pool, task, text_part, | |||
RSPAMD_URL_FIND_STRICT); | |||
} | |||
if (text_part->exceptions) { | |||
@@ -1231,7 +1236,8 @@ rspamd_message_parse (struct rspamd_task *task) | |||
p = task->subject; | |||
len = strlen (p); | |||
rspamd_cryptobox_hash_update (&st, p, len); | |||
rspamd_url_find_multiple (task->task_pool, p, len, FALSE, NULL, | |||
rspamd_url_find_multiple (task->task_pool, p, len, | |||
RSPAMD_URL_FIND_STRICT, NULL, | |||
rspamd_url_task_subject_callback, task); | |||
} | |||
@@ -598,7 +598,8 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool, | |||
} | |||
if (end > url_text + 4 && | |||
rspamd_url_find (pool, url_text, end - url_text, &url_str, FALSE, | |||
rspamd_url_find (pool, url_text, end - url_text, &url_str, | |||
RSPAMD_URL_FIND_ALL, | |||
&url_pos, NULL) && | |||
url_str != NULL) { | |||
if (url_pos > 0) { | |||
@@ -1569,7 +1570,8 @@ rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url, | |||
if (url->querylen > 0) { | |||
if (rspamd_url_find (pool, url->query, url->querylen, &url_str, FALSE, | |||
if (rspamd_url_find (pool, url->query, url->querylen, &url_str, | |||
RSPAMD_URL_FIND_ALL, | |||
NULL, &prefix_added)) { | |||
query_url = rspamd_mempool_alloc0 (pool, | |||
sizeof (struct rspamd_url)); |
@@ -205,7 +205,7 @@ struct url_matcher static_matchers[] = { | |||
{"sip:", "", url_web_start, url_web_end, | |||
0, 0}, | |||
{"www.", "http://", url_web_start, url_web_end, | |||
0, 0}, | |||
URL_FLAG_NOHTML, 0}, | |||
{"ftp.", "ftp://", url_web_start, url_web_end, | |||
URL_FLAG_NOHTML, 0}, | |||
/* Likely emails */ | |||
@@ -218,7 +218,7 @@ struct url_callback_data { | |||
gchar *url_str; | |||
rspamd_mempool_t *pool; | |||
gint len; | |||
gboolean is_html; | |||
enum rspamd_url_find_type how; | |||
gboolean prefix_added; | |||
guint newline_idx; | |||
GPtrArray *newlines; | |||
@@ -2584,12 +2584,12 @@ rspamd_url_trie_is_match (struct url_matcher *matcher, const gchar *pos, | |||
static gint | |||
rspamd_url_trie_callback (struct rspamd_multipattern *mp, | |||
guint strnum, | |||
gint match_start, | |||
gint match_pos, | |||
const gchar *text, | |||
gsize len, | |||
void *context) | |||
guint strnum, | |||
gint match_start, | |||
gint match_pos, | |||
const gchar *text, | |||
gsize len, | |||
void *context) | |||
{ | |||
struct url_matcher *matcher; | |||
url_match_t m; | |||
@@ -2599,7 +2599,7 @@ rspamd_url_trie_callback (struct rspamd_multipattern *mp, | |||
matcher = &g_array_index (url_scanner->matchers, struct url_matcher, | |||
strnum); | |||
if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) { | |||
if ((matcher->flags & URL_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) { | |||
/* Do not try to match non-html like urls in html texts */ | |||
return 0; | |||
} | |||
@@ -2669,9 +2669,12 @@ rspamd_url_trie_callback (struct rspamd_multipattern *mp, | |||
} | |||
gboolean | |||
rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len, | |||
gchar **url_str, gboolean is_html, goffset *url_pos, | |||
gboolean *prefix_added) | |||
rspamd_url_find (rspamd_mempool_t *pool, | |||
const gchar *begin, gsize len, | |||
gchar **url_str, | |||
enum rspamd_url_find_type how, | |||
goffset *url_pos, | |||
gboolean *prefix_added) | |||
{ | |||
struct url_callback_data cb; | |||
gint ret; | |||
@@ -2679,7 +2682,7 @@ rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len, | |||
memset (&cb, 0, sizeof (cb)); | |||
cb.begin = begin; | |||
cb.end = begin + len; | |||
cb.is_html = is_html; | |||
cb.how = how; | |||
cb.pool = pool; | |||
ret = rspamd_multipattern_lookup (url_scanner->search_trie, begin, len, | |||
@@ -2706,13 +2709,13 @@ rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len, | |||
static gint | |||
rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp, | |||
guint strnum, | |||
gint match_start, | |||
gint match_pos, | |||
const gchar *text, | |||
gsize len, | |||
void *context, | |||
gboolean multiple) | |||
guint strnum, | |||
gint match_start, | |||
gint match_pos, | |||
const gchar *text, | |||
gsize len, | |||
void *context, | |||
gboolean multiple) | |||
{ | |||
struct rspamd_url *url; | |||
struct url_matcher *matcher; | |||
@@ -2726,7 +2729,7 @@ rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp, | |||
strnum); | |||
pool = cb->pool; | |||
if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) { | |||
if ((matcher->flags & URL_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) { | |||
/* Do not try to match non-html like urls in html texts */ | |||
return 0; | |||
} | |||
@@ -2894,7 +2897,7 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset, | |||
/* We also search the query for additional url inside */ | |||
if (url->querylen > 0) { | |||
if (rspamd_url_find (task->task_pool, url->query, url->querylen, | |||
&url_str, IS_PART_HTML (cbd->part), NULL, &prefix_added)) { | |||
&url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) { | |||
query_url = rspamd_mempool_alloc0 (task->task_pool, | |||
sizeof (struct rspamd_url)); | |||
@@ -2938,9 +2941,9 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset, | |||
void | |||
rspamd_url_text_extract (rspamd_mempool_t *pool, | |||
struct rspamd_task *task, | |||
struct rspamd_mime_text_part *part, | |||
gboolean is_html) | |||
struct rspamd_task *task, | |||
struct rspamd_mime_text_part *part, | |||
enum rspamd_url_find_type how) | |||
{ | |||
struct rspamd_url_mimepart_cbdata mcbd; | |||
@@ -2953,14 +2956,18 @@ rspamd_url_text_extract (rspamd_mempool_t *pool, | |||
mcbd.part = part; | |||
rspamd_url_find_multiple (task->task_pool, part->utf_stripped_content->data, | |||
part->utf_stripped_content->len, is_html, part->newlines, | |||
part->utf_stripped_content->len, how, part->newlines, | |||
rspamd_url_text_part_callback, &mcbd); | |||
} | |||
void | |||
rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in, | |||
gsize inlen, gboolean is_html, GPtrArray *nlines, | |||
url_insert_function func, gpointer ud) | |||
rspamd_url_find_multiple (rspamd_mempool_t *pool, | |||
const gchar *in, | |||
gsize inlen, | |||
enum rspamd_url_find_type how, | |||
GPtrArray *nlines, | |||
url_insert_function func, | |||
gpointer ud) | |||
{ | |||
struct url_callback_data cb; | |||
@@ -2973,7 +2980,7 @@ rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in, | |||
memset (&cb, 0, sizeof (cb)); | |||
cb.begin = in; | |||
cb.end = in + inlen; | |||
cb.is_html = is_html; | |||
cb.how = how; | |||
cb.pool = pool; | |||
cb.funcd = ud; | |||
@@ -2986,9 +2993,12 @@ rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in, | |||
} | |||
void | |||
rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in, | |||
gsize inlen, gboolean is_html, | |||
url_insert_function func, gpointer ud) | |||
rspamd_url_find_single (rspamd_mempool_t *pool, | |||
const gchar *in, | |||
gsize inlen, | |||
enum rspamd_url_find_type how, | |||
url_insert_function func, | |||
gpointer ud) | |||
{ | |||
struct url_callback_data cb; | |||
@@ -3001,7 +3011,7 @@ rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in, | |||
memset (&cb, 0, sizeof (cb)); | |||
cb.begin = in; | |||
cb.end = in + inlen; | |||
cb.is_html = is_html; | |||
cb.how = how; | |||
cb.pool = pool; | |||
cb.funcd = ud; | |||
@@ -3049,7 +3059,7 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset, | |||
/* We also search the query for additional url inside */ | |||
if (url->querylen > 0) { | |||
if (rspamd_url_find (task->task_pool, url->query, url->querylen, | |||
&url_str, FALSE, NULL, &prefix_added)) { | |||
&url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) { | |||
query_url = rspamd_mempool_alloc0 (task->task_pool, | |||
sizeof (struct rspamd_url)); |
@@ -90,6 +90,17 @@ enum rspamd_url_protocol { | |||
PROTOCOL_UNKNOWN = 1u << 31, | |||
}; | |||
enum rspamd_url_parse_flags { | |||
RSPAMD_URL_PARSE_TEXT = 0, | |||
RSPAMD_URL_PARSE_HREF = (1u << 0), | |||
RSPAMD_URL_PARSE_CHECK = (1 << 1), | |||
}; | |||
enum rspamd_url_find_type { | |||
RSPAMD_URL_FIND_ALL = 0, | |||
RSPAMD_URL_FIND_STRICT, | |||
}; | |||
/** | |||
* Initialize url library | |||
* @param cfg | |||
@@ -104,15 +115,9 @@ void rspamd_url_deinit (void); | |||
* @param is_html turn on html euristic | |||
*/ | |||
void rspamd_url_text_extract (rspamd_mempool_t *pool, | |||
struct rspamd_task *task, | |||
struct rspamd_mime_text_part *part, | |||
gboolean is_html); | |||
enum rspamd_url_parse_flags { | |||
RSPAMD_URL_PARSE_TEXT = 0, | |||
RSPAMD_URL_PARSE_HREF = (1u << 0), | |||
RSPAMD_URL_PARSE_CHECK = (1 << 1), | |||
}; | |||
struct rspamd_task *task, | |||
struct rspamd_mime_text_part *part, | |||
enum rspamd_url_find_type how); | |||
/* | |||
* Parse a single url into an uri structure | |||
@@ -136,9 +141,12 @@ enum uri_errno rspamd_url_parse (struct rspamd_url *uri, | |||
* @param url_str storage for url string(or NULL) | |||
* @return TRUE if url is found in specified text | |||
*/ | |||
gboolean rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len, | |||
gchar **url_str, gboolean is_html, goffset *url_pos, | |||
gboolean *prefix_added); | |||
gboolean rspamd_url_find (rspamd_mempool_t *pool, | |||
const gchar *begin, gsize len, | |||
gchar **url_str, | |||
enum rspamd_url_find_type how, | |||
goffset *url_pos, | |||
gboolean *prefix_added); | |||
/* | |||
* Return text representation of url parsing error | |||
*/ | |||
@@ -166,9 +174,12 @@ typedef void (*url_insert_function) (struct rspamd_url *url, | |||
* @param func | |||
* @param ud | |||
*/ | |||
void rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in, | |||
gsize inlen, gboolean is_html, GPtrArray *nlines, | |||
url_insert_function func, gpointer ud); | |||
void rspamd_url_find_multiple (rspamd_mempool_t *pool, | |||
const gchar *in, gsize inlen, | |||
enum rspamd_url_find_type how, | |||
GPtrArray *nlines, | |||
url_insert_function func, | |||
gpointer ud); | |||
/** | |||
* Search for a single url in text and call `func` for each url found | |||
* @param pool | |||
@@ -178,9 +189,11 @@ void rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in, | |||
* @param func | |||
* @param ud | |||
*/ | |||
void rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in, | |||
gsize inlen, gboolean is_html, | |||
url_insert_function func, gpointer ud); | |||
void rspamd_url_find_single (rspamd_mempool_t *pool, | |||
const gchar *in, gsize inlen, | |||
enum rspamd_url_find_type how, | |||
url_insert_function func, | |||
gpointer ud); | |||
/** | |||
* Generic callback to insert URLs into rspamd_task | |||
@@ -190,8 +203,8 @@ void rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in, | |||
* @param ud | |||
*/ | |||
void rspamd_url_task_subject_callback (struct rspamd_url *url, | |||
gsize start_offset, | |||
gsize end_offset, gpointer ud); | |||
gsize start_offset, | |||
gsize end_offset, gpointer ud); | |||
/** | |||
* Adds a tag for url | |||
@@ -200,8 +213,8 @@ void rspamd_url_task_subject_callback (struct rspamd_url *url, | |||
* @param pool | |||
*/ | |||
void rspamd_url_add_tag (struct rspamd_url *url, const gchar *tag, | |||
const gchar *value, | |||
rspamd_mempool_t *pool); | |||
const gchar *value, | |||
rspamd_mempool_t *pool); | |||
guint rspamd_url_hash (gconstpointer u); | |||
guint rspamd_email_hash (gconstpointer u); | |||
@@ -232,7 +245,7 @@ gsize rspamd_url_decode (gchar *dst, const gchar *src, gsize size); | |||
* @return | |||
*/ | |||
const gchar * rspamd_url_encode (struct rspamd_url *url, gsize *dlen, | |||
rspamd_mempool_t *pool); | |||
rspamd_mempool_t *pool); | |||
/** |
@@ -799,7 +799,7 @@ lua_url_create (lua_State *L) | |||
return luaL_error (L, "invalid arguments"); | |||
} | |||
else { | |||
rspamd_url_find_single (pool, text, length, FALSE, | |||
rspamd_url_find_single (pool, text, length, RSPAMD_URL_FIND_ALL, | |||
lua_url_single_inserter, L); | |||
if (lua_type (L, -1) != LUA_TUSERDATA) { | |||
@@ -867,7 +867,8 @@ lua_url_all (lua_State *L) | |||
if (text != NULL) { | |||
lua_newtable (L); | |||
rspamd_url_find_multiple (pool, text, length, FALSE, NULL, | |||
rspamd_url_find_multiple (pool, text, length, | |||
RSPAMD_URL_FIND_ALL, NULL, | |||
lua_url_table_inserter, L); | |||
} |