From: Vsevolod Stakhov Date: Tue, 14 Apr 2015 12:11:28 +0000 (+0100) Subject: Use hash table instead of tree for urls. X-Git-Tag: 0.9.0~262 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=fa43e3bfc7a4ca93b3ffcae730b6705bc402a2b5;p=rspamd.git Use hash table instead of tree for urls. --- diff --git a/src/libmime/message.c b/src/libmime/message.c index b94d2fb19..95a9bea95 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -1718,8 +1718,8 @@ process_message (struct rspamd_task *task) if ((rc == URI_ERRNO_OK) && subject_url->hostlen > 0) { if (subject_url->protocol != PROTOCOL_MAILTO) { - if (!g_tree_lookup (task->urls, subject_url)) { - g_tree_insert (task->urls, + if (!g_hash_table_lookup (task->urls, subject_url)) { + g_hash_table_insert (task->urls, subject_url, subject_url); } diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c index afae67487..b4271f1dc 100644 --- a/src/libmime/mime_expressions.c +++ b/src/libmime/mime_expressions.c @@ -737,23 +737,24 @@ struct url_regexp_param { gboolean found; }; -static gboolean +static void tree_url_callback (gpointer key, gpointer value, void *data) { struct url_regexp_param *param = data; struct rspamd_url *url = value; + if (param->found) { + return; + } + if (rspamd_mime_regexp_element_process (param->task, param->re, struri (url), 0, FALSE)) { param->found = TRUE; - return TRUE; } else if (G_UNLIKELY (param->re->is_test)) { msg_info ("process test regexp %s for url %s returned FALSE", struri (url)); } - - return FALSE; } static gint @@ -911,10 +912,10 @@ rspamd_mime_expr_process_regexp (struct rspamd_regexp_atom *re, callback_param.re = re; callback_param.found = FALSE; if (task->urls) { - g_tree_foreach (task->urls, tree_url_callback, &callback_param); + g_hash_table_foreach (task->urls, tree_url_callback, &callback_param); } if (task->emails && callback_param.found == FALSE) { - g_tree_foreach (task->emails, tree_url_callback, &callback_param); + g_hash_table_foreach 
(task->emails, tree_url_callback, &callback_param); } if (callback_param.found == FALSE) { rspamd_task_re_cache_add (task, re->regexp_text, 0); diff --git a/src/libserver/html.c b/src/libserver/html.c index 6ff4f4bae..563ac0825 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -892,14 +892,14 @@ parse_tag_url (struct rspamd_task *task, } if (url->protocol == PROTOCOL_MAILTO) { if (url->userlen > 0) { - if (!g_tree_lookup (task->emails, url)) { - g_tree_insert (task->emails, url, url); + if (!g_hash_table_lookup (task->emails, url)) { + g_hash_table_insert (task->emails, url, url); } } } else { - if (!g_tree_lookup (task->urls, url)) { - g_tree_insert (task->urls, url, url); + if (!g_hash_table_lookup (task->urls, url)) { + g_hash_table_insert (task->urls, url, url); } } } diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c index 4dfec4d7e..e48b303d7 100644 --- a/src/libserver/protocol.c +++ b/src/libserver/protocol.c @@ -607,7 +607,7 @@ struct tree_cb_data { /* * Callback for writing urls */ -static gboolean +static void urls_protocol_cb (gpointer key, gpointer value, gpointer ud) { struct tree_cb_data *cb = ud; @@ -646,12 +646,10 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud) rspamd_inet_address_to_string (cb->task->from_addr), struri (url)); } - - return FALSE; } static ucl_object_t * -rspamd_urls_tree_ucl (GTree *input, struct rspamd_task *task) +rspamd_urls_tree_ucl (GHashTable *input, struct rspamd_task *task) { struct tree_cb_data cb; ucl_object_t *obj; @@ -660,12 +658,12 @@ rspamd_urls_tree_ucl (GTree *input, struct rspamd_task *task) cb.top = obj; cb.task = task; - g_tree_foreach (input, urls_protocol_cb, &cb); + g_hash_table_foreach (input, urls_protocol_cb, &cb); return obj; } -static gboolean +static void emails_protocol_cb (gpointer key, gpointer value, gpointer ud) { struct tree_cb_data *cb = ud; @@ -674,12 +672,10 @@ emails_protocol_cb (gpointer key, gpointer value, gpointer ud) obj = ucl_object_fromlstring 
(url->user, url->userlen + url->hostlen + 1); ucl_array_append (cb->top, obj); - - return FALSE; } static ucl_object_t * -rspamd_emails_tree_ucl (GTree *input, struct rspamd_task *task) +rspamd_emails_tree_ucl (GHashTable *input, struct rspamd_task *task) { struct tree_cb_data cb; ucl_object_t *obj; @@ -688,7 +684,7 @@ rspamd_emails_tree_ucl (GTree *input, struct rspamd_task *task) cb.top = obj; cb.task = task; - g_tree_foreach (input, emails_protocol_cb, &cb); + g_hash_table_foreach (input, emails_protocol_cb, &cb); return obj; } @@ -1009,11 +1005,11 @@ rspamd_protocol_http_reply (struct rspamd_http_message *msg, ucl_object_insert_key (top, rspamd_str_list_ucl ( task->messages), "messages", 0, false); } - if (g_tree_nnodes (task->urls) > 0) { + if (g_hash_table_size (task->urls) > 0) { ucl_object_insert_key (top, rspamd_urls_tree_ucl (task->urls, task), "urls", 0, false); } - if (g_tree_nnodes (task->emails) > 0) { + if (g_hash_table_size (task->emails) > 0) { ucl_object_insert_key (top, rspamd_emails_tree_ucl (task->emails, task), "emails", 0, false); } diff --git a/src/libserver/task.c b/src/libserver/task.c index 85f4c3ca1..91a669647 100644 --- a/src/libserver/task.c +++ b/src/libserver/task.c @@ -92,13 +92,13 @@ rspamd_task_new (struct rspamd_worker *worker) rspamd_mempool_add_destructor (new_task->task_pool, (rspamd_mempool_destruct_t) g_hash_table_unref, new_task->raw_headers); - new_task->emails = g_tree_new (rspamd_emails_cmp); + new_task->emails = g_hash_table_new (rspamd_url_hash, rspamd_emails_cmp); rspamd_mempool_add_destructor (new_task->task_pool, - (rspamd_mempool_destruct_t) g_tree_destroy, + (rspamd_mempool_destruct_t) g_hash_table_unref, new_task->emails); - new_task->urls = g_tree_new (rspamd_urls_cmp); + new_task->urls = g_hash_table_new (rspamd_url_hash, rspamd_urls_cmp); rspamd_mempool_add_destructor (new_task->task_pool, - (rspamd_mempool_destruct_t) g_tree_destroy, + (rspamd_mempool_destruct_t) g_hash_table_unref, new_task->urls); 
new_task->sock = -1; new_task->flags |= (RSPAMD_TASK_FLAG_MIME|RSPAMD_TASK_FLAG_JSON); diff --git a/src/libserver/task.h b/src/libserver/task.h index e845cc9e1..608be8685 100644 --- a/src/libserver/task.h +++ b/src/libserver/task.h @@ -118,8 +118,8 @@ struct rspamd_task { GList *text_parts; /**< list of text parts */ gchar *raw_headers_str; /**< list of raw headers */ GList *received; /**< list of received headers */ - GTree *urls; /**< list of parsed urls */ - GTree *emails; /**< list of parsed emails */ + GHashTable *urls; /**< list of parsed urls */ + GHashTable *emails; /**< list of parsed emails */ GList *images; /**< list of images */ GHashTable *raw_headers; /**< list of raw headers */ GHashTable *results; /**< hash table of metric_result indexed by diff --git a/src/libserver/url.c b/src/libserver/url.c index 5e2bd4044..e655ab3e4 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -1313,14 +1313,14 @@ rspamd_url_text_extract (rspamd_mempool_t * pool, ex->len = url_end - url_start; if (new->protocol == PROTOCOL_MAILTO) { if (new->userlen > 0) { - if (!g_tree_lookup (task->emails, new)) { - g_tree_insert (task->emails, new, new); + if (!g_hash_table_lookup (task->emails, new)) { + g_hash_table_insert (task->emails, new, new); } } } else { - if (!g_tree_lookup (task->urls, new)) { - g_tree_insert (task->urls, new, new); + if (!g_hash_table_lookup (task->urls, new)) { + g_hash_table_insert (task->urls, new, new); } } part->urls_offset = g_list_prepend ( diff --git a/src/libutil/util.c b/src/libutil/util.c index 5e48dcdcf..640610a83 100644 --- a/src/libutil/util.c +++ b/src/libutil/util.c @@ -1536,51 +1536,70 @@ rspamd_strlcpy_tolower (gchar *dst, const gchar *src, gsize siz) return (s - src - 1); /* count does not include NUL */ } +guint +rspamd_url_hash (gconstpointer u) +{ + const struct rspamd_url *url = u; + XXH64_state_t st; + + XXH64_reset (&st, 0xdeadbabe); + + if (url->hostlen > 0) { + XXH64_update (&st, url->host, url->hostlen); + } + if 
(url->userlen > 0) { + XXH64_update (&st, url->user, url->userlen); + } + XXH64_update (&st, url->is_phished, sizeof (url->is_phished)); + + return XXH64_digest (&st); +} + /* Compare two emails for building emails tree */ -gint +gboolean rspamd_emails_cmp (gconstpointer a, gconstpointer b) { const struct rspamd_url *u1 = a, *u2 = b; gint r; if (u1->hostlen != u2->hostlen || u1->hostlen == 0) { - return u1->hostlen - u2->hostlen; + return FALSE; } else { if ((r = g_ascii_strncasecmp (u1->host, u2->host, u1->hostlen)) == 0) { if (u1->userlen != u2->userlen || u1->userlen == 0) { - return u1->userlen - u2->userlen; + return FALSE; } else { - return g_ascii_strncasecmp (u1->user, u2->user, u1->userlen); + return g_ascii_strncasecmp (u1->user, u2->user, u1->userlen) == 0; } } else { - return r; + return r == 0; } } - return 0; + return FALSE; } -gint +gboolean rspamd_urls_cmp (gconstpointer a, gconstpointer b) { const struct rspamd_url *u1 = a, *u2 = b; int r; if (u1->hostlen != u2->hostlen || u1->hostlen == 0) { - return u1->hostlen - u2->hostlen; + return FALSE; } else { r = g_ascii_strncasecmp (u1->host, u2->host, u1->hostlen); if (r == 0 && u1->is_phished != u2->is_phished) { /* Always insert phished urls to the tree */ - return -1; + return FALSE; } } - return r; + return r == 0; } /* diff --git a/src/libutil/util.h b/src/libutil/util.h index de28fe93b..65dbd4442 100644 --- a/src/libutil/util.h +++ b/src/libutil/util.h @@ -245,11 +245,13 @@ gsize rspamd_strlcpy_tolower (gchar *dst, const gchar *src, gsize siz); #define ts_to_usec(ts) ((ts)->tv_sec * 1000000LLU + \ (ts)->tv_nsec / 1000LLU) -/* Compare two emails for building emails tree */ -gint rspamd_emails_cmp (gconstpointer a, gconstpointer b); +guint rspamd_url_hash (gconstpointer u); -/* Compare two urls for building emails tree */ -gint rspamd_urls_cmp (gconstpointer a, gconstpointer b); +/* Compare two emails for building emails hash */ +gboolean rspamd_emails_cmp (gconstpointer a, gconstpointer b); + +/* 
Compare two urls for building emails hash */ +gboolean rspamd_urls_cmp (gconstpointer a, gconstpointer b); /* * Find string find in string s ignoring case diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c index ae6780245..4d9f44cee 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -813,7 +813,7 @@ struct lua_tree_cb_data { int i; }; -static gboolean +static void lua_tree_url_callback (gpointer key, gpointer value, gpointer ud) { struct rspamd_url **purl; @@ -823,8 +823,6 @@ lua_tree_url_callback (gpointer key, gpointer value, gpointer ud) rspamd_lua_setclass (cb->L, "rspamd{url}", -1); *purl = value; lua_rawseti (cb->L, -2, cb->i++); - - return FALSE; } static gint @@ -837,7 +835,7 @@ lua_task_get_urls (lua_State * L) lua_newtable (L); cb.i = 1; cb.L = L; - g_tree_foreach (task->urls, lua_tree_url_callback, &cb); + g_hash_table_foreach (task->urls, lua_tree_url_callback, &cb); return 1; } @@ -874,7 +872,7 @@ lua_task_get_emails (lua_State * L) lua_newtable (L); cb.i = 1; cb.L = L; - g_tree_foreach (task->emails, lua_tree_url_callback, &cb); + g_hash_table_foreach (task->emails, lua_tree_url_callback, &cb); return 1; } diff --git a/src/plugins/surbl.c b/src/plugins/surbl.c index 6c1731fea..30abf950f 100644 --- a/src/plugins/surbl.c +++ b/src/plugins/surbl.c @@ -575,7 +575,7 @@ format_surbl_request (rspamd_mempool_t * pool, gboolean append_suffix, GError ** err, gboolean forced, - GTree *tree, + GHashTable *tree, struct rspamd_url *url) { GHashTable *t; @@ -714,7 +714,7 @@ format_surbl_request (rspamd_mempool_t * pool, url->surbllen = r; if (tree != NULL) { - if (g_tree_lookup (tree, result) != NULL) { + if (g_hash_table_lookup (tree, result) != NULL) { msg_debug ("url %s is already registered", result); g_set_error (err, SURBL_ERROR, /* error domain */ DUPLICATE_ERROR, /* error code */ @@ -723,7 +723,7 @@ format_surbl_request (rspamd_mempool_t * pool, return NULL; } else { - g_tree_insert (tree, result, result); + g_hash_table_insert (tree, result, 
url); } } @@ -754,7 +754,7 @@ format_surbl_request (rspamd_mempool_t * pool, static void make_surbl_requests (struct rspamd_url *url, struct rspamd_task *task, - struct suffix_item *suffix, gboolean forced, GTree *tree) + struct suffix_item *suffix, gboolean forced, GHashTable *tree) { gchar *surbl_req; rspamd_fstring_t f; @@ -992,7 +992,7 @@ redirector_callback (gint fd, short what, void *arg) static void register_redirector_call (struct rspamd_url *url, struct rspamd_task *task, - struct suffix_item *suffix, const gchar *rule, GTree *tree) + struct suffix_item *suffix, const gchar *rule, GHashTable *tree) { gint s = -1; struct redirector_param *param; @@ -1043,7 +1043,7 @@ register_redirector_call (struct rspamd_url *url, struct rspamd_task *task, rule); } -static gboolean +static void surbl_tree_url_callback (gpointer key, gpointer value, void *data) { struct redirector_param *param = data; @@ -1058,7 +1058,7 @@ surbl_tree_url_callback (gpointer key, gpointer value, void *data) debug_task ("check url %s", struri (url)); if (url->hostlen <= 0) { - return FALSE; + return; } if (surbl_module_ctx->use_redirector) { @@ -1095,7 +1095,7 @@ surbl_tree_url_callback (gpointer key, gpointer value, void *data) param->suffix, red_domain, param->tree); - return FALSE; + return; } } } @@ -1107,8 +1107,6 @@ surbl_tree_url_callback (gpointer key, gpointer value, void *data) make_surbl_requests (url, param->task, param->suffix, FALSE, param->tree); } - - return FALSE; } static void @@ -1119,108 +1117,9 @@ surbl_test_url (struct rspamd_task *task, void *user_data) param.task = task; param.suffix = suffix; - param.tree = g_tree_new ((GCompareFunc)strcmp); + param.tree = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t)g_tree_destroy, + (rspamd_mempool_destruct_t)g_hash_table_unref, param.tree); - g_tree_foreach (task->urls, surbl_tree_url_callback, &param); -} -/* - * Handlers of URLS command - */ -#if 0 
-struct urls_tree_cb_data { - gchar *buf; - gsize len; - gsize off; - struct rspamd_task *task; -}; - -static gboolean -calculate_buflen_cb (gpointer key, gpointer value, gpointer cbdata) -{ - struct urls_tree_cb_data *cb = cbdata; - struct rspamd_url *url = value; - - cb->len += strlen (struri (url)) + url->hostlen + sizeof (" <\"\">, ") - 1; - - return FALSE; -} - -static gboolean -write_urls_buffer (gpointer key, gpointer value, gpointer cbdata) -{ - struct urls_tree_cb_data *cb = cbdata; - struct rspamd_url *url = value; - rspamd_fstring_t f; - gchar *urlstr; - gsize len; - - f.begin = url->host; - f.len = url->hostlen; - if ((urlstr = - format_surbl_request (cb->task->task_pool, &f, NULL, FALSE, NULL, - FALSE)) != NULL) { - len = strlen (urlstr); - if (cb->off + len >= cb->len) { - msg_info ( - "cannot write urls header completely, stripped reply at: %z", - cb->off); - return TRUE; - } - else { - cb->off += rspamd_snprintf (cb->buf + cb->off, - cb->len - cb->off, - " %s <\"%s\">,", - urlstr, - struri (url)); - } - } - - return FALSE; -} - - -static gboolean -urls_command_handler (struct rspamd_task *task) -{ - struct urls_tree_cb_data cb; - - /* First calculate buffer length */ - cb.len = sizeof (RSPAMD_REPLY_BANNER "/1.0 0 " SPAMD_OK CRLF "Urls: " CRLF); - cb.off = 0; - g_tree_foreach (task->urls, calculate_buflen_cb, &cb); - - cb.buf = rspamd_mempool_alloc (task->task_pool, cb.len * sizeof (gchar)); - cb.off += rspamd_snprintf (cb.buf + cb.off, - cb.len - cb.off, - "%s/%s 0 %s" CRLF "Urls:", - (task->proto == SPAMC_PROTO) ? 
SPAMD_REPLY_BANNER : RSPAMD_REPLY_BANNER, - "1.3", - SPAMD_OK); - cb.task = task; - - /* Write urls to buffer */ - g_tree_foreach (task->urls, write_urls_buffer, &cb); - - /* Strip last ',' */ - if (cb.buf[cb.off - 1] == ',') { - cb.buf[--cb.off] = '\0'; - } - /* Write result */ - if (!rspamd_dispatcher_write (task->dispatcher, cb.buf, cb.off, FALSE, - TRUE)) { - return FALSE; - } - if (!rspamd_dispatcher_write (task->dispatcher, CRLF, sizeof (CRLF) - 1, - FALSE, TRUE)) { - return FALSE; - } - task->state = STATE_REPLY; - - return TRUE; + g_hash_table_foreach (task->urls, surbl_tree_url_callback, &param); } -#endif -/* - * vi:ts=4 - */ diff --git a/src/plugins/surbl.h b/src/plugins/surbl.h index 959a730de..b75bf991b 100644 --- a/src/plugins/surbl.h +++ b/src/plugins/surbl.h @@ -63,7 +63,7 @@ struct redirector_param { GString *buf; struct event ev; gint sock; - GTree *tree; + GHashTable *tree; struct suffix_item *suffix; };