From 40e894b9dfda24c8b454bf2365905d517e8e27a3 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 2 Apr 2019 11:07:53 +0100 Subject: [PATCH] [Rework] Rework HTML content urls extraction --- src/libmime/message.c | 10 ++++-- src/libserver/html.c | 6 ++-- src/libserver/url.c | 80 ++++++++++++++++++++++++------------------- src/libserver/url.h | 59 ++++++++++++++++++------------- src/lua/lua_url.c | 5 +-- 5 files changed, 96 insertions(+), 64 deletions(-) diff --git a/src/libmime/message.c b/src/libmime/message.c index cca134f81..6825bc2f0 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -912,7 +912,12 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task, rspamd_normalize_text_part (task, text_part); if (!IS_PART_HTML (text_part)) { - rspamd_url_text_extract (task->task_pool, task, text_part, FALSE); + rspamd_url_text_extract (task->task_pool, task, text_part, + RSPAMD_URL_FIND_ALL); + } + else { + rspamd_url_text_extract (task->task_pool, task, text_part, + RSPAMD_URL_FIND_STRICT); } if (text_part->exceptions) { @@ -1231,7 +1236,8 @@ rspamd_message_parse (struct rspamd_task *task) p = task->subject; len = strlen (p); rspamd_cryptobox_hash_update (&st, p, len); - rspamd_url_find_multiple (task->task_pool, p, len, FALSE, NULL, + rspamd_url_find_multiple (task->task_pool, p, len, + RSPAMD_URL_FIND_STRICT, NULL, rspamd_url_task_subject_callback, task); } diff --git a/src/libserver/html.c b/src/libserver/html.c index 6df545f00..41925609e 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -598,7 +598,8 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool, } if (end > url_text + 4 && - rspamd_url_find (pool, url_text, end - url_text, &url_str, FALSE, + rspamd_url_find (pool, url_text, end - url_text, &url_str, + RSPAMD_URL_FIND_ALL, &url_pos, NULL) && url_str != NULL) { if (url_pos > 0) { @@ -1569,7 +1570,8 @@ rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url, if (url->querylen > 0) { - if (rspamd_url_find (pool, url->query, url->querylen, &url_str, FALSE, + if (rspamd_url_find (pool, url->query, url->querylen, &url_str, + RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) { query_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url)); diff --git a/src/libserver/url.c b/src/libserver/url.c index f0f5bb21b..d774eb440 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -205,7 +205,7 @@ struct url_matcher static_matchers[] = { {"sip:", "", url_web_start, url_web_end, 0, 0}, {"www.", "http://", url_web_start, url_web_end, - 0, 0}, + URL_FLAG_NOHTML, 0}, {"ftp.", "ftp://", url_web_start, url_web_end, URL_FLAG_NOHTML, 0}, /* Likely emails */ @@ -218,7 +218,7 @@ struct url_callback_data { gchar *url_str; rspamd_mempool_t *pool; gint len; - gboolean is_html; + enum rspamd_url_find_type how; gboolean prefix_added; guint newline_idx; GPtrArray *newlines; @@ -2584,12 +2584,12 @@ rspamd_url_trie_is_match (struct url_matcher *matcher, const gchar *pos, static gint rspamd_url_trie_callback (struct rspamd_multipattern *mp, - guint strnum, - gint match_start, - gint match_pos, - const gchar *text, - gsize len, - void *context) + guint strnum, + gint match_start, + gint match_pos, + const gchar *text, + gsize len, + void *context) { struct url_matcher *matcher; url_match_t m; @@ -2599,7 +2599,7 @@ rspamd_url_trie_callback (struct rspamd_multipattern *mp, matcher = &g_array_index (url_scanner->matchers, struct url_matcher, strnum); - if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) { + if ((matcher->flags & URL_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) { /* Do not try to match non-html like urls in html texts */ return 0; } @@ -2669,9 +2669,12 @@ rspamd_url_trie_callback (struct rspamd_multipattern *mp, } gboolean -rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len, - gchar **url_str, gboolean is_html, goffset *url_pos, - gboolean *prefix_added) +rspamd_url_find (rspamd_mempool_t *pool, + const gchar *begin, gsize len, + gchar **url_str, + enum rspamd_url_find_type how, + goffset *url_pos, + gboolean *prefix_added) { struct url_callback_data cb; gint ret; @@ -2679,7 +2682,7 @@ rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len, memset (&cb, 0, sizeof (cb)); cb.begin = begin; cb.end = begin + len; - cb.is_html = is_html; + cb.how = how; cb.pool = pool; ret = rspamd_multipattern_lookup (url_scanner->search_trie, begin, len, @@ -2706,13 +2709,13 @@ rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len, static gint rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp, - guint strnum, - gint match_start, - gint match_pos, - const gchar *text, - gsize len, - void *context, - gboolean multiple) + guint strnum, + gint match_start, + gint match_pos, + const gchar *text, + gsize len, + void *context, + gboolean multiple) { struct rspamd_url *url; struct url_matcher *matcher; @@ -2726,7 +2729,7 @@ rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp, strnum); pool = cb->pool; - if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) { + if ((matcher->flags & URL_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) { /* Do not try to match non-html like urls in html texts */ return 0; } @@ -2894,7 +2897,7 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset, /* We also search the query for additional url inside */ if (url->querylen > 0) { if (rspamd_url_find (task->task_pool, url->query, url->querylen, - &url_str, IS_PART_HTML (cbd->part), NULL, &prefix_added)) { + &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) { query_url = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_url)); @@ -2938,9 +2941,9 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset, void rspamd_url_text_extract (rspamd_mempool_t *pool, - struct rspamd_task *task, - struct rspamd_mime_text_part *part, - gboolean is_html) + struct rspamd_task *task, + struct rspamd_mime_text_part *part, + enum rspamd_url_find_type how) { struct rspamd_url_mimepart_cbdata mcbd; @@ -2953,14 +2956,18 @@ rspamd_url_text_extract (rspamd_mempool_t *pool, mcbd.part = part; rspamd_url_find_multiple (task->task_pool, part->utf_stripped_content->data, - part->utf_stripped_content->len, is_html, part->newlines, + part->utf_stripped_content->len, how, part->newlines, rspamd_url_text_part_callback, &mcbd); } void -rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in, - gsize inlen, gboolean is_html, GPtrArray *nlines, - url_insert_function func, gpointer ud) +rspamd_url_find_multiple (rspamd_mempool_t *pool, + const gchar *in, + gsize inlen, + enum rspamd_url_find_type how, + GPtrArray *nlines, + url_insert_function func, + gpointer ud) { struct url_callback_data cb; @@ -2973,7 +2980,7 @@ rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in, memset (&cb, 0, sizeof (cb)); cb.begin = in; cb.end = in + inlen; - cb.is_html = is_html; + cb.how = how; cb.pool = pool; cb.funcd = ud; @@ -2986,9 +2993,12 @@ rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in, } void -rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in, - gsize inlen, gboolean is_html, - url_insert_function func, gpointer ud) +rspamd_url_find_single (rspamd_mempool_t *pool, + const gchar *in, + gsize inlen, + enum rspamd_url_find_type how, + url_insert_function func, + gpointer ud) { struct url_callback_data cb; @@ -3001,7 +3011,7 @@ rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in, memset (&cb, 0, sizeof (cb)); cb.begin = in; cb.end = in + inlen; - cb.is_html = is_html; + cb.how = how; cb.pool = pool; cb.funcd = ud; @@ -3049,7 +3059,7 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset, /* We also search the query for additional url inside */ if (url->querylen > 0) { if (rspamd_url_find (task->task_pool, url->query, url->querylen, - &url_str, FALSE, NULL, &prefix_added)) { + &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) { query_url = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_url)); diff --git a/src/libserver/url.h b/src/libserver/url.h index 2cf80df4b..2243534dc 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -90,6 +90,17 @@ enum rspamd_url_protocol { PROTOCOL_UNKNOWN = 1u << 31, }; +enum rspamd_url_parse_flags { + RSPAMD_URL_PARSE_TEXT = 0, + RSPAMD_URL_PARSE_HREF = (1u << 0), + RSPAMD_URL_PARSE_CHECK = (1 << 1), +}; + +enum rspamd_url_find_type { + RSPAMD_URL_FIND_ALL = 0, + RSPAMD_URL_FIND_STRICT, +}; + /** * Initialize url library * @param cfg @@ -104,15 +115,9 @@ void rspamd_url_deinit (void); * @param is_html turn on html euristic */ void rspamd_url_text_extract (rspamd_mempool_t *pool, - struct rspamd_task *task, - struct rspamd_mime_text_part *part, - gboolean is_html); - -enum rspamd_url_parse_flags { - RSPAMD_URL_PARSE_TEXT = 0, - RSPAMD_URL_PARSE_HREF = (1u << 0), - RSPAMD_URL_PARSE_CHECK = (1 << 1), -}; + struct rspamd_task *task, + struct rspamd_mime_text_part *part, + enum rspamd_url_find_type how); /* * Parse a single url into an uri structure @@ -136,9 +141,12 @@ enum uri_errno rspamd_url_parse (struct rspamd_url *uri, * @param url_str storage for url string(or NULL) * @return TRUE if url is found in specified text */ -gboolean rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len, - gchar **url_str, gboolean is_html, goffset *url_pos, - gboolean *prefix_added); +gboolean rspamd_url_find (rspamd_mempool_t *pool, + const gchar *begin, gsize len, + gchar **url_str, + enum rspamd_url_find_type how, + goffset *url_pos, + gboolean *prefix_added); /* * Return text representation of url parsing error */ @@ -166,9 +174,12 @@ typedef void (*url_insert_function) (struct rspamd_url *url, * @param func * @param ud */ -void rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in, - gsize inlen, gboolean is_html, GPtrArray *nlines, - url_insert_function func, gpointer ud); +void rspamd_url_find_multiple (rspamd_mempool_t *pool, + const gchar *in, gsize inlen, + enum rspamd_url_find_type how, + GPtrArray *nlines, + url_insert_function func, + gpointer ud); /** * Search for a single url in text and call `func` for each url found * @param pool @@ -178,9 +189,11 @@ void rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in, * @param func * @param ud */ -void rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in, - gsize inlen, gboolean is_html, - url_insert_function func, gpointer ud); +void rspamd_url_find_single (rspamd_mempool_t *pool, + const gchar *in, gsize inlen, + enum rspamd_url_find_type how, + url_insert_function func, + gpointer ud); /** * Generic callback to insert URLs into rspamd_task @@ -190,8 +203,8 @@ void rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in, * @param ud */ void rspamd_url_task_subject_callback (struct rspamd_url *url, - gsize start_offset, - gsize end_offset, gpointer ud); + gsize start_offset, + gsize end_offset, gpointer ud); /** * Adds a tag for url @@ -200,8 +213,8 @@ void rspamd_url_task_subject_callback (struct rspamd_url *url, * @param pool */ void rspamd_url_add_tag (struct rspamd_url *url, const gchar *tag, - const gchar *value, - rspamd_mempool_t *pool); + const gchar *value, + rspamd_mempool_t *pool); guint rspamd_url_hash (gconstpointer u); guint rspamd_email_hash (gconstpointer u); @@ -232,7 +245,7 @@ gsize rspamd_url_decode (gchar *dst, const gchar *src, gsize size); * @return */ const gchar * rspamd_url_encode (struct rspamd_url *url, gsize *dlen, - rspamd_mempool_t *pool); + rspamd_mempool_t *pool); /** diff --git a/src/lua/lua_url.c b/src/lua/lua_url.c index 9bc984da3..a0f8c4648 100644 --- a/src/lua/lua_url.c +++ b/src/lua/lua_url.c @@ -799,7 +799,7 @@ lua_url_create (lua_State *L) return luaL_error (L, "invalid arguments"); } else { - rspamd_url_find_single (pool, text, length, FALSE, + rspamd_url_find_single (pool, text, length, RSPAMD_URL_FIND_ALL, lua_url_single_inserter, L); if (lua_type (L, -1) != LUA_TUSERDATA) { @@ -867,7 +867,8 @@ lua_url_all (lua_State *L) if (text != NULL) { lua_newtable (L); - rspamd_url_find_multiple (pool, text, length, FALSE, NULL, + rspamd_url_find_multiple (pool, text, length, + RSPAMD_URL_FIND_ALL, NULL, lua_url_table_inserter, L); } -- 2.39.5