From 5cb7a60c653914d09fd7a83bf4838a97b0d664d5 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 7 Mar 2020 16:32:14 +0000 Subject: [PATCH] [Rework] Urls: adopt html related stuff --- src/libmime/message.c | 3 +- src/libserver/html.c | 142 +++++++++++++----------------------------- src/libserver/html.h | 5 +- src/libserver/url.c | 58 +++++++++++++---- src/libserver/url.h | 9 +++ 5 files changed, 103 insertions(+), 114 deletions(-) diff --git a/src/libmime/message.c b/src/libmime/message.c index 40b7fe8bc..c45550e6d 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -758,8 +758,7 @@ rspamd_message_process_html_text_part (struct rspamd_task *task, text_part->html, text_part->utf_raw_content, &text_part->exceptions, - MESSAGE_FIELD (task, urls), - MESSAGE_FIELD (task, emails)); + MESSAGE_FIELD (task, urls)); if (text_part->utf_content->len == 0) { text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; diff --git a/src/libserver/html.c b/src/libserver/html.c index 981141ad8..9b5282575 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -1617,10 +1617,9 @@ rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag, static void rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url, - GHashTable *tbl_urls, GHashTable *tbl_emails) + khash_t (rspamd_url_hash) *url_set) { - GHashTable *target_tbl; - struct rspamd_url *query_url, *existing; + struct rspamd_url *query_url; gchar *url_str; gint rc; gboolean prefix_added; @@ -1648,13 +1647,6 @@ rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url, msg_debug_html ("found url %s in query of url" " %*s", url_str, url->querylen, rspamd_url_query_unsafe (url)); - if (query_url->protocol == PROTOCOL_MAILTO) { - target_tbl = tbl_emails; - } - else { - target_tbl = tbl_urls; - } - if (prefix_added) { query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS; } @@ -1671,15 +1663,7 @@ rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url, query_url->flags |= RSPAMD_URL_FLAG_OBSCURED; } - if ((existing = g_hash_table_lookup (target_tbl, - query_url)) == NULL) { - g_hash_table_insert (target_tbl, - query_url, - query_url); - } - else { - existing->count ++; - } + rspamd_url_set_add_or_increase (url_set, query_url); } } } @@ -1739,7 +1723,7 @@ rspamd_html_process_data_image (rspamd_mempool_t *pool, static void rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag, - struct html_content *hc, GHashTable *urls) + struct html_content *hc, khash_t (rspamd_url_hash) *url_set) { struct html_tag_component *comp; struct html_image *img; @@ -1784,17 +1768,8 @@ rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag, img->src, fstr.len, NULL); if (img->url) { - struct rspamd_url *turl = g_hash_table_lookup (urls, - img->url); - img->url->flags |= RSPAMD_URL_FLAG_IMAGE; - - if (turl == NULL) { - g_hash_table_insert (urls, img->url, img->url); - } - else { - turl->count++; - } + rspamd_url_set_add_or_increase (url_set, img->url); } } } @@ -2449,10 +2424,11 @@ rspamd_html_process_block_tag (rspamd_mempool_t *pool, struct html_tag *tag, static void rspamd_html_check_displayed_url (rspamd_mempool_t *pool, - GList **exceptions, GHashTable *urls, GHashTable *emails, - GByteArray *dest, GHashTable *target_tbl, - gint href_offset, - struct rspamd_url *url) + GList **exceptions, + khash_t (rspamd_url_hash) *url_set, + GByteArray *dest, + gint href_offset, + struct rspamd_url *url) { struct rspamd_url *displayed_url = NULL; struct rspamd_url *turl; @@ -2477,6 +2453,7 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool, if (url_found) { url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL; } + if (exceptions && url_found) { ex = rspamd_mempool_alloc (pool, sizeof (*ex)); @@ -2489,39 +2466,27 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool, ex); } - if (displayed_url) { - if (displayed_url->protocol == - PROTOCOL_MAILTO) { - target_tbl = emails; - } - else { - target_tbl = urls; - } + if (displayed_url && url_set) { + turl = rspamd_url_set_add_or_return (url_set, + displayed_url); - if (target_tbl != NULL) { - turl = g_hash_table_lookup (target_tbl, - displayed_url); - - if (turl != NULL) { - /* Here, we assume the following: - * if we have a URL in the text part which - * is the same as displayed URL in the - * HTML part, we assume that it is also - * hint only. - */ - if (turl->flags & - RSPAMD_URL_FLAG_FROM_TEXT) { - turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED; - turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT; - } - - turl->count ++; - } - else { - g_hash_table_insert (target_tbl, - displayed_url, - displayed_url); + if (turl != NULL) { + /* Here, we assume the following: + * if we have a URL in the text part which + * is the same as displayed URL in the + * HTML part, we assume that it is also + * hint only. + */ + if (turl->flags & + RSPAMD_URL_FLAG_FROM_TEXT) { + turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED; + turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT; } + + turl->count ++; + } + else { + /* Already inserted by `rspamd_url_set_add_or_return` */ } } } @@ -2625,20 +2590,22 @@ rspamd_html_propagate_style (struct html_content *hc, } GByteArray* -rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, - GByteArray *in, GList **exceptions, GHashTable *urls, GHashTable *emails) +rspamd_html_process_part_full (rspamd_mempool_t *pool, + struct html_content *hc, + GByteArray *in, + GList **exceptions, + khash_t (rspamd_url_hash) *url_set) { const guchar *p, *c, *end, *savep = NULL; guchar t; gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE, balanced; GByteArray *dest; - GHashTable *target_tbl; guint obrace = 0, ebrace = 0; GNode *cur_level = NULL; gint substate = 0, len, href_offset = -1; struct html_tag *cur_tag = NULL, *content_tag = NULL; - struct rspamd_url *url = NULL, *turl; + struct rspamd_url *url = NULL; GQueue *styles_blocks; enum { @@ -3089,28 +3056,9 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, if (url != NULL) { - if (url->protocol == PROTOCOL_MAILTO) { - target_tbl = emails; - } - else { - target_tbl = urls; - } - - if (target_tbl != NULL) { - turl = g_hash_table_lookup (target_tbl, url); - - if (turl == NULL) { - g_hash_table_insert (target_tbl, url, url); - } - else { - turl->count ++; - url = NULL; - } - - if (turl == NULL && url != NULL) { - rspamd_process_html_url (pool, - url, - urls, emails); + if (url_set != NULL) { + if (!rspamd_url_set_add_or_increase (url_set, url)) { + rspamd_process_html_url (pool, url, url_set); } } @@ -3131,8 +3079,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, prev_url = prev_tag->extra; rspamd_html_check_displayed_url (pool, - exceptions, urls, emails, - dest, target_tbl, href_offset, + exceptions, url_set, + dest, href_offset, prev_url); } } @@ -3142,8 +3090,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, /* Insert exception */ if (url != NULL && (gint) dest->len > href_offset) { rspamd_html_check_displayed_url (pool, - exceptions, urls, emails, - dest, target_tbl, href_offset, + exceptions, url_set, + dest, href_offset, url); } @@ -3172,7 +3120,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, } if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) { - rspamd_html_process_img_tag (pool, cur_tag, hc, urls); + rspamd_html_process_img_tag (pool, cur_tag, hc, url_set); } else if (cur_tag->flags & FL_BLOCK) { struct html_block *bl; @@ -3237,5 +3185,5 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc, GByteArray *in) { - return rspamd_html_process_part_full (pool, hc, in, NULL, NULL, NULL); + return rspamd_html_process_part_full (pool, hc, in, NULL, NULL); } diff --git a/src/libserver/html.h b/src/libserver/html.h index ee5c242cb..051df5b63 100644 --- a/src/libserver/html.h +++ b/src/libserver/html.h @@ -6,7 +6,8 @@ #define RSPAMD_HTML_H #include "config.h" -#include "mem_pool.h" +#include "libutil/mem_pool.h" +#include "libserver/url.h" #ifdef __cplusplus extern "C" { @@ -142,7 +143,7 @@ GByteArray *rspamd_html_process_part (rspamd_mempool_t *pool, GByteArray *rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, GByteArray *in, GList **exceptions, - GHashTable *urls, GHashTable *emails); + khash_t (rspamd_url_hash) *url_set); /* * Returns true if a specified tag has been seen in a part diff --git a/src/libserver/url.c b/src/libserver/url.c index 505d1d150..39ea5acc2 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -3297,7 +3297,7 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset, { struct rspamd_task *task = ud; gchar *url_str = NULL; - struct rspamd_url *query_url, *existing; + struct rspamd_url *query_url; gint rc; gboolean prefix_added; @@ -3781,6 +3781,26 @@ rspamd_url_set_add_or_increase (khash_t (rspamd_url_hash) *set, return true; } +struct rspamd_url * +rspamd_url_set_add_or_return (khash_t (rspamd_url_hash) *set, + struct rspamd_url *u) +{ + khiter_t k; + gint r; + + if (set) { + k = kh_put (rspamd_url_hash, set, u, &r); + + if (r == 0) { + struct rspamd_url *ex = kh_key (set, k); + + return ex; + } + } + + return NULL; +} + bool rspamd_url_host_set_add (khash_t (rspamd_url_host_hash) *set, struct rspamd_url *u) @@ -3788,13 +3808,17 @@ rspamd_url_host_set_add (khash_t (rspamd_url_host_hash) *set, khiter_t k; gint r; - k = kh_put (rspamd_url_host_hash, set, u, &r); + if (set) { + k = kh_put (rspamd_url_host_hash, set, u, &r); - if (r == 0) { - return false; + if (r == 0) { + return false; + } + + return true; } - return true; + return false; } bool @@ -3802,13 +3826,17 @@ rspamd_url_set_has (khash_t (rspamd_url_hash) *set, struct rspamd_url *u) { khiter_t k; - k = kh_get (rspamd_url_hash, set, u); + if (set) { + k = kh_get (rspamd_url_hash, set, u); - if (k == kh_end (set)) { - return false; + if (k == kh_end (set)) { + return false; + } + + return true; } - return true; + return false; } bool @@ -3816,11 +3844,15 @@ rspamd_url_host_set_has (khash_t (rspamd_url_host_hash) *set, struct rspamd_url { khiter_t k; - k = kh_get (rspamd_url_hash, set, u); + if (set) { + k = kh_get (rspamd_url_host_hash, set, u); - if (k == kh_end (set)) { - return false; + if (k == kh_end (set)) { + return false; + } + + return true; } - return true; + return false; } \ No newline at end of file diff --git a/src/libserver/url.h b/src/libserver/url.h index aff7ccf5f..bf8ba4b63 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -280,6 +280,15 @@ KHASH_DECLARE (rspamd_url_host_hash, struct rspamd_url *, char); */ bool rspamd_url_set_add_or_increase (khash_t (rspamd_url_hash) *set, struct rspamd_url *u); + +/** + * Same as rspamd_url_set_add_or_increase but returns the existing url if found + * @param set + * @param u + * @return + */ +struct rspamd_url * rspamd_url_set_add_or_return (khash_t (rspamd_url_hash) *set, + struct rspamd_url *u); /** * Helper for url host set * @param set -- 2.39.5