diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-03-07 12:15:51 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-03-09 10:46:11 +0000 |
commit | 50a043a7cbce8142d81b7887d263a9573ff568eb (patch) | |
tree | 403d02dc57b10b9016696fed678332bc01756e44 | |
parent | c399a6013b8522fc28ed11839fae6cbe7062278a (diff) | |
download | rspamd-50a043a7cbce8142d81b7887d263a9573ff568eb.tar.gz rspamd-50a043a7cbce8142d81b7887d263a9573ff568eb.zip |
[Rework] Urls: more rework of the urls sets
-rw-r--r-- | src/libmime/message.c | 8 | ||||
-rw-r--r-- | src/libmime/message.h | 4 | ||||
-rw-r--r-- | src/libserver/html.h | 3 | ||||
-rw-r--r-- | src/libserver/protocol.c | 46 | ||||
-rw-r--r-- | src/libserver/re_cache.c | 24 | ||||
-rw-r--r-- | src/libserver/url.c | 191 | ||||
-rw-r--r-- | src/libserver/url.h | 24 |
7 files changed, 137 insertions, 163 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c index a43e109b5..40b7fe8bc 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -1048,8 +1048,7 @@ rspamd_message_dtor (struct rspamd_message *msg) g_ptr_array_unref (msg->text_parts); g_ptr_array_unref (msg->parts); - g_hash_table_unref (msg->urls); - g_hash_table_unref (msg->emails); + kh_destroy (rspamd_url_hash, msg->urls); } struct rspamd_message* @@ -1060,10 +1059,7 @@ rspamd_message_new (struct rspamd_task *task) msg = rspamd_mempool_alloc0 (task->task_pool, sizeof (*msg)); msg->raw_headers = rspamd_message_headers_new (); - - msg->emails = g_hash_table_new (rspamd_email_hash, rspamd_emails_cmp); - msg->urls = g_hash_table_new (rspamd_url_hash, rspamd_urls_cmp); - + msg->urls = kh_init (rspamd_url_hash); msg->parts = g_ptr_array_sized_new (4); msg->text_parts = g_ptr_array_sized_new (2); msg->task = task; diff --git a/src/libmime/message.h b/src/libmime/message.h index 91d6e13d4..96ed9d5d4 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -13,6 +13,7 @@ #include "libcryptobox/cryptobox.h" #include "libmime/mime_headers.h" #include "libmime/content_type.h" +#include "libserver/url.h" #include "libutil/ref.h" #include "libutil/str_util.h" @@ -175,8 +176,7 @@ struct rspamd_message { GPtrArray *text_parts; /**< list of text parts */ struct rspamd_message_raw_headers_content raw_headers_content; struct rspamd_received_header *received; /**< list of received headers */ - GHashTable *urls; /**< list of parsed urls */ - GHashTable *emails; /**< list of parsed emails */ + khash_t (rspamd_url_hash) *urls; struct rspamd_mime_headers_table *raw_headers; /**< list of raw headers */ struct rspamd_mime_header *headers_order; /**< order of raw headers */ struct rspamd_task *task; diff --git a/src/libserver/html.h b/src/libserver/html.h index b369bd890..ee5c242cb 100644 --- a/src/libserver/html.h +++ b/src/libserver/html.h @@ -141,7 +141,8 @@ GByteArray *rspamd_html_process_part (rspamd_mempool_t *pool, GByteArray *rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, - GByteArray *in, GList **exceptions, GHashTable *urls, GHashTable *emails); + GByteArray *in, GList **exceptions, + GHashTable *urls, GHashTable *emails); /* * Returns true if a specified tag has been seen in a part diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c index 739d3b950..35d50b909 100644 --- a/src/libserver/protocol.c +++ b/src/libserver/protocol.c @@ -861,7 +861,7 @@ rspamd_protocol_handle_request (struct rspamd_task *task, /* Structure for writing tree data */ struct tree_cb_data { ucl_object_t *top; - GHashTable *seen; + khash_t (rspamd_url_host_hash) *seen; struct rspamd_task *task; }; @@ -908,10 +908,8 @@ rspamd_protocol_extended_url (struct rspamd_task *task, * Callback for writing urls */ static void -urls_protocol_cb (gpointer key, gpointer value, gpointer ud) +urls_protocol_cb (struct rspamd_url *url, struct tree_cb_data *cb) { - struct tree_cb_data *cb = ud; - struct rspamd_url *url = value; ucl_object_t *obj; struct rspamd_task *task = cb->task; const gchar *user_field = "unknown", *encoded = NULL; @@ -921,7 +919,7 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud) if (!(task->protocol_flags & RSPAMD_TASK_PROTOCOL_FLAG_EXT_URLS)) { if (url->hostlen > 0) { - if (g_hash_table_lookup (cb->seen, url)) { + if (rspamd_url_host_set_has (cb->seen, url)) { return; } @@ -941,7 +939,7 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud) return; } - g_hash_table_insert (cb->seen, url, url); + rspamd_url_host_set_add (cb->seen, url); } else { encoded = rspamd_url_encode (url, &enclen, task->task_pool); @@ -975,28 +973,32 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud) } static ucl_object_t * -rspamd_urls_tree_ucl (GHashTable *input, struct rspamd_task *task) +rspamd_urls_tree_ucl (khash_t (rspamd_url_hash) *set, + struct rspamd_task *task) { struct tree_cb_data cb; ucl_object_t *obj; + struct rspamd_url *u; obj = ucl_object_typed_new (UCL_ARRAY); cb.top = obj; cb.task = task; - cb.seen = g_hash_table_new (rspamd_url_host_hash, rspamd_urls_host_cmp); + cb.seen = kh_init (rspamd_url_host_hash); - g_hash_table_foreach (input, urls_protocol_cb, &cb); + kh_foreach_key (set, u, { + if (!(u->protocol & PROTOCOL_MAILTO)) { + urls_protocol_cb (u, &cb); + } + }); - g_hash_table_unref (cb.seen); + kh_destroy (rspamd_url_host_hash, cb.seen); return obj; } static void -emails_protocol_cb (gpointer key, gpointer value, gpointer ud) +emails_protocol_cb (struct rspamd_url *url, struct tree_cb_data *cb) { - struct tree_cb_data *cb = ud; - struct rspamd_url *url = value; ucl_object_t *obj; if (url->userlen > 0 && url->hostlen > 0) { @@ -1007,16 +1009,23 @@ emails_protocol_cb (gpointer key, gpointer value, gpointer ud) } static ucl_object_t * -rspamd_emails_tree_ucl (GHashTable *input, struct rspamd_task *task) +rspamd_emails_tree_ucl (khash_t (rspamd_url_hash) *set, + struct rspamd_task *task) { struct tree_cb_data cb; ucl_object_t *obj; + struct rspamd_url *u; obj = ucl_object_typed_new (UCL_ARRAY); cb.top = obj; cb.task = task; - g_hash_table_foreach (input, emails_protocol_cb, &cb); + kh_foreach_key (set, u, { + if ((u->protocol & PROTOCOL_MAILTO)) { + emails_protocol_cb (u, &cb); + } + }); + return obj; } @@ -1446,15 +1455,12 @@ rspamd_protocol_write_ucl (struct rspamd_task *task, } if (flags & RSPAMD_PROTOCOL_URLS && task->message) { - if (g_hash_table_size (MESSAGE_FIELD (task, urls)) > 0) { + if (kh_size (MESSAGE_FIELD (task, urls)) > 0) { ucl_object_insert_key (top, rspamd_urls_tree_ucl (MESSAGE_FIELD (task, urls), task), "urls", 0, false); - } - - if (g_hash_table_size (MESSAGE_FIELD (task, emails)) > 0) { ucl_object_insert_key (top, - rspamd_emails_tree_ucl (MESSAGE_FIELD (task, emails), task), + rspamd_emails_tree_ucl (MESSAGE_FIELD (task, urls), task), "emails", 0, false); } } diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c index 995af8ddf..257428720 100644 --- a/src/libserver/re_cache.c +++ b/src/libserver/re_cache.c @@ -1053,7 +1053,6 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, gboolean is_strong) { guint ret = 0, i, re_id; - GHashTableIter it; struct rspamd_mime_header *rh; const gchar *in; const guchar **scvec; @@ -1062,7 +1061,6 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, struct rspamd_mime_text_part *text_part; struct rspamd_mime_part *mime_part; struct rspamd_url *url; - gpointer k, v; guint len, cnt; const gchar *class_name; @@ -1164,17 +1162,18 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, } break; case RSPAMD_RE_URL: - cnt = g_hash_table_size (MESSAGE_FIELD (task, urls)); + cnt = kh_size (MESSAGE_FIELD (task, urls)); if (cnt > 0) { scvec = g_malloc (sizeof (*scvec) * cnt); lenvec = g_malloc (sizeof (*lenvec) * cnt); - g_hash_table_iter_init (&it, MESSAGE_FIELD (task, urls)); i = 0; raw = FALSE; - while (g_hash_table_iter_next (&it, &k, &v)) { - url = v; + kh_foreach_key (MESSAGE_FIELD (task, urls), url, { + if ((url->protocol & PROTOCOL_MAILTO)) { + continue; + } in = url->string; len = url->urllen; @@ -1182,7 +1181,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, scvec[i] = (guchar *) in; lenvec[i++] = len; } - } + }); #if 0 g_hash_table_iter_init (&it, MESSAGE_FIELD (task, emails)); @@ -1207,18 +1206,19 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, } break; case RSPAMD_RE_EMAIL: - cnt = g_hash_table_size (MESSAGE_FIELD (task, emails)); + cnt = kh_size (MESSAGE_FIELD (task, urls)); if (cnt > 0) { scvec = g_malloc (sizeof (*scvec) * cnt); lenvec = g_malloc (sizeof (*lenvec) * cnt); - g_hash_table_iter_init (&it, MESSAGE_FIELD (task, emails)); i = 0; raw = FALSE; - while (g_hash_table_iter_next (&it, &k, &v)) { - url = v; + kh_foreach_key (MESSAGE_FIELD (task, urls), url, { + if (!(url->protocol & PROTOCOL_MAILTO)) { + continue; + } if (url->userlen == 0 || url->hostlen == 0) { continue; } @@ -1227,7 +1227,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, len = url->userlen + 1 + url->hostlen; scvec[i] = (guchar *) in; lenvec[i++] = len; - } + }); ret = rspamd_re_cache_process_regexp_data (rt, re, task, scvec, lenvec, i, raw, &processed_hyperscan); diff --git a/src/libserver/url.c b/src/libserver/url.c index 3449310b2..505d1d150 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -214,6 +214,13 @@ struct url_matcher static_matchers[] = { URL_FLAG_NOHTML} }; + +static inline khint_t rspamd_url_hash (struct rspamd_url *u); + +static inline khint_t rspamd_url_host_hash (struct rspamd_url * u); +static inline bool rspamd_urls_cmp (struct rspamd_url *a, struct rspamd_url *b); +static inline bool rspamd_urls_host_cmp (struct rspamd_url *a, struct rspamd_url *b); + /* Hash table implementation */ __KHASH_IMPL (rspamd_url_hash, kh_inline,struct rspamd_url *, char, false, rspamd_url_hash, rspamd_urls_cmp); @@ -3116,7 +3123,6 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset, struct rspamd_task *task; gchar *url_str = NULL; struct rspamd_url *query_url, *existing; - GHashTable *target_tbl = NULL; gint rc; gboolean prefix_added; @@ -3141,36 +3147,23 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset, } if (url->protocol == PROTOCOL_MAILTO) { - if (url->userlen > 0) { - target_tbl = MESSAGE_FIELD (task, emails); + if (url->userlen == 0) { + return FALSE; } } - else { - target_tbl = MESSAGE_FIELD (task, urls); - } - - if (target_tbl) { - /* Also check max urls */ - if (cbd->task->cfg && cbd->task->cfg->max_urls > 0) { - if (g_hash_table_size (target_tbl) > cbd->task->cfg->max_urls) { - msg_err_task ("part has too many URLs, we cannot process more: " - "%d urls extracted ", - (guint)g_hash_table_size (target_tbl)); - - return FALSE; - } - } + /* Also check max urls */ + if (cbd->task->cfg && cbd->task->cfg->max_urls > 0) { + if (kh_size (MESSAGE_FIELD (task, urls)) > cbd->task->cfg->max_urls) { + msg_err_task ("part has too many URLs, we cannot process more: " + "%d urls extracted ", + (guint)kh_size (MESSAGE_FIELD (task, urls))); - if ((existing = g_hash_table_lookup (target_tbl, url)) == NULL) { - url->flags |= RSPAMD_URL_FLAG_FROM_TEXT; - g_hash_table_insert (target_tbl, url, url); - } - else { - existing->count++; + return FALSE; } } - target_tbl = NULL; + url->flags |= RSPAMD_URL_FLAG_FROM_TEXT; + rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), url); cbd->part->exceptions = g_list_prepend ( cbd->part->exceptions, @@ -3178,7 +3171,8 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset, /* We also search the query for additional url inside */ if (url->querylen > 0) { - if (rspamd_url_find (task->task_pool, rspamd_url_query_unsafe (url), url->querylen, + if (rspamd_url_find (task->task_pool, + rspamd_url_query_unsafe (url), url->querylen, &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) { query_url = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_url)); @@ -3198,23 +3192,13 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset, } if (query_url->protocol == PROTOCOL_MAILTO) { - if (query_url->userlen > 0) { - target_tbl = MESSAGE_FIELD (task, emails); + if (query_url->userlen == 0) { + return TRUE; } } - else { - target_tbl = MESSAGE_FIELD (task, urls); - } - if (target_tbl) { - if ((existing = g_hash_table_lookup (target_tbl, query_url)) == NULL) { - url->flags |= RSPAMD_URL_FLAG_FROM_TEXT; - g_hash_table_insert (target_tbl, query_url, query_url); - } - else { - existing->count++; - } - } + query_url->flags |= RSPAMD_URL_FLAG_FROM_TEXT; + rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), query_url); } } } @@ -3321,27 +3305,13 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset, url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED|RSPAMD_URL_FLAG_SUBJECT; if (url->protocol == PROTOCOL_MAILTO) { - if (url->userlen > 0 && url->hostlen > 0) { - if ((existing = g_hash_table_lookup (MESSAGE_FIELD (task, emails), - url)) == NULL) { - g_hash_table_insert (MESSAGE_FIELD (task, emails), url, - url); - } - else { - existing->count ++; - } - } - } - else { - if ((existing = g_hash_table_lookup (MESSAGE_FIELD (task, urls), - url)) == NULL) { - g_hash_table_insert (MESSAGE_FIELD (task, urls), url, url); - } - else { - existing->count ++; + if (url->userlen == 0) { + return FALSE; } } + rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), url); + /* We also search the query for additional url inside */ if (url->querylen > 0) { if (rspamd_url_find (task->task_pool, rspamd_url_query_unsafe (url), url->querylen, @@ -3364,15 +3334,14 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset, query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS; } - if ((existing = g_hash_table_lookup (MESSAGE_FIELD (task, urls), - query_url)) == NULL) { - g_hash_table_insert (MESSAGE_FIELD (task, urls), - query_url, - query_url); - } - else { - existing->count ++; + if (query_url->protocol == PROTOCOL_MAILTO) { + if (query_url->userlen == 0) { + return TRUE; + } } + + rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), + query_url); } } } @@ -3380,26 +3349,22 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset, return TRUE; } -inline guint -rspamd_url_hash (gconstpointer u) +static inline khint_t +rspamd_url_hash (struct rspamd_url *url) { - const struct rspamd_url *url = u; - if (url->urllen > 0) { - return (guint)rspamd_cryptobox_fast_hash (url->string, url->urllen, + return (khint_t)rspamd_cryptobox_fast_hash (url->string, url->urllen, rspamd_hash_seed ()); } return 0; } -inline guint -rspamd_url_host_hash (gconstpointer u) +static inline khint_t +rspamd_url_host_hash (struct rspamd_url *url) { - const struct rspamd_url *url = u; - if (url->hostlen > 0) { - return (guint)rspamd_cryptobox_fast_hash (rspamd_url_host_unsafe (url), + return (khint_t)rspamd_cryptobox_fast_hash (rspamd_url_host_unsafe (url), url->hostlen, rspamd_hash_seed ()); } @@ -3407,30 +3372,10 @@ rspamd_url_host_hash (gconstpointer u) return 0; } -inline guint -rspamd_email_hash (gconstpointer u) -{ - const struct rspamd_url *url = u; - rspamd_cryptobox_fast_hash_state_t st; - - rspamd_cryptobox_fast_hash_init (&st, rspamd_hash_seed ()); - - if (url->hostlen > 0) { - rspamd_cryptobox_fast_hash_update (&st, rspamd_url_host_unsafe (url), url->hostlen); - } - - if (url->userlen > 0) { - rspamd_cryptobox_fast_hash_update (&st, rspamd_url_user_unsafe(url), url->userlen); - } - - return (guint)rspamd_cryptobox_fast_hash_final (&st); -} - /* Compare two emails for building emails tree */ -inline gboolean -rspamd_emails_cmp (gconstpointer a, gconstpointer b) +static inline bool +rspamd_emails_cmp (struct rspamd_url *u1, struct rspamd_url *u2) { - const struct rspamd_url *u1 = a, *u2 = b; gint r; if (u1->hostlen != u2->hostlen || u1->hostlen == 0) { @@ -3456,30 +3401,32 @@ rspamd_emails_cmp (gconstpointer a, gconstpointer b) return FALSE; } -inline gboolean -rspamd_urls_cmp (gconstpointer a, gconstpointer b) +static inline bool +rspamd_urls_cmp (struct rspamd_url *u1, struct rspamd_url *u2) { - const struct rspamd_url *u1 = a, *u2 = b; int r = 0; - if (u1->urllen != u2->urllen) { - return FALSE; + if (u1->protocol != u2->protocol || u1->urllen != u2->urllen) { + return false; } else { + if (u1->protocol & PROTOCOL_MAILTO) { + return rspamd_emails_cmp (u1, u2); + } + r = memcmp (u1->string, u2->string, u1->urllen); } return r == 0; } -inline gboolean -rspamd_urls_host_cmp (gconstpointer a, gconstpointer b) +static inline bool +rspamd_urls_host_cmp (struct rspamd_url *u1, struct rspamd_url *u2) { - const struct rspamd_url *u1 = a, *u2 = b; int r = 0; if (u1->hostlen != u2->hostlen) { - return FALSE; + return false; } else { r = memcmp (rspamd_url_host_unsafe (u1), rspamd_url_host_unsafe (u2), @@ -3835,6 +3782,22 @@ rspamd_url_set_add_or_increase (khash_t (rspamd_url_hash) *set, } bool +rspamd_url_host_set_add (khash_t (rspamd_url_host_hash) *set, + struct rspamd_url *u) +{ + khiter_t k; + gint r; + + k = kh_put (rspamd_url_host_hash, set, u, &r); + + if (r == 0) { + return false; + } + + return true; +} + +bool rspamd_url_set_has (khash_t (rspamd_url_hash) *set, struct rspamd_url *u) { khiter_t k; @@ -3846,4 +3809,18 @@ rspamd_url_set_has (khash_t (rspamd_url_hash) *set, struct rspamd_url *u) } return true; +} + +bool +rspamd_url_host_set_has (khash_t (rspamd_url_host_hash) *set, struct rspamd_url *u) +{ + khiter_t k; + + k = kh_get (rspamd_url_hash, set, u); + + if (k == kh_end (set)) { + return false; + } + + return true; }
\ No newline at end of file diff --git a/src/libserver/url.h b/src/libserver/url.h index 358c61e16..aff7ccf5f 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -225,21 +225,6 @@ gboolean rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset, gsize end_offset, gpointer ud); -guint rspamd_url_hash (gconstpointer u); - -guint rspamd_email_hash (gconstpointer u); - -guint rspamd_url_host_hash (gconstpointer u); - - -/* Compare two emails for building emails hash */ -gboolean rspamd_emails_cmp (gconstpointer a, gconstpointer b); - -/* Compare two urls for building emails hash */ -gboolean rspamd_urls_cmp (gconstpointer a, gconstpointer b); - -gboolean rspamd_urls_host_cmp (gconstpointer a, gconstpointer b); - /** * Decode URL encoded string in-place and return new length of a string, src and dst are NULL terminated * @param dst @@ -296,12 +281,21 @@ KHASH_DECLARE (rspamd_url_host_hash, struct rspamd_url *, char); bool rspamd_url_set_add_or_increase (khash_t (rspamd_url_hash) *set, struct rspamd_url *u); /** + * Helper for url host set + * @param set + * @param u + * @return + */ +bool rspamd_url_host_set_add (khash_t (rspamd_url_host_hash) *set, + struct rspamd_url *u); +/** * Checks if a url is in set * @param set * @param u * @return */ bool rspamd_url_set_has (khash_t (rspamd_url_hash) *set, struct rspamd_url *u); +bool rspamd_url_host_set_has (khash_t (rspamd_url_host_hash) *set, struct rspamd_url *u); #ifdef __cplusplus } |