From: Vsevolod Stakhov Date: Thu, 23 Jun 2011 15:05:58 +0000 (+0400) Subject: * Fixes to fuzzy hashing logic, skip urls while estimating fuzzy hash X-Git-Tag: 0.4.0~44 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=5022c0333ffd8ce5eca3dc9e2679b612e2c9ce99;p=rspamd.git * Fixes to fuzzy hashing logic, skip urls while estimating fuzzy hash Fix tags stripping. Fix phishing checks (ignore img tags). --- diff --git a/src/expressions.c b/src/expressions.c index e590ad630..fa6ce0fef 100644 --- a/src/expressions.c +++ b/src/expressions.c @@ -1059,7 +1059,7 @@ rspamd_parts_distance (struct worker_task * task, GList * args, void *unused) return FALSE; } if (!p1->is_empty && !p2->is_empty) { - diff = fuzzy_compare_hashes (p1->fuzzy, p2->fuzzy); + diff = fuzzy_compare_parts (p1, p2); debug_task ("got likeliness between parts of %d%%, threshold is %d%%", diff, threshold); if (diff <= threshold) { return TRUE; diff --git a/src/fuzzy.c b/src/fuzzy.c index 61ef5647e..ce5217b5b 100644 --- a/src/fuzzy.c +++ b/src/fuzzy.c @@ -27,6 +27,9 @@ #include "mem_pool.h" #include "fstring.h" #include "fuzzy.h" +#include "message.h" +#include "url.h" +#include "main.h" #define ROLL_WINDOW_SIZE 9 #define MIN_FUZZY_BLOCK_SIZE 3 @@ -81,16 +84,17 @@ fuzzy_fnv_hash (gchar c, guint32 hval) static guint32 fuzzy_blocksize (guint32 len) { + guint32 nlen = MIN_FUZZY_BLOCK_SIZE; - if (len < MIN_FUZZY_BLOCK_SIZE) { - return MIN_FUZZY_BLOCK_SIZE; + while (nlen * (FUZZY_HASHLEN - 1) < len) { + nlen *= 2; } - return g_spaced_primes_closest (len / FUZZY_HASHLEN); + return nlen; } /* Update hash with new symbol */ -void +static void fuzzy_update (fuzzy_hash_t * h, gchar c) { h->rh = fuzzy_roll_hash (c); @@ -105,6 +109,30 @@ fuzzy_update (fuzzy_hash_t * h, gchar c) } } +static void +fuzzy_update2 (fuzzy_hash_t * h1, fuzzy_hash_t *h2, gchar c) +{ + h1->rh = fuzzy_roll_hash (c); + h1->h = fuzzy_fnv_hash (c, h1->h); + h2->rh = h1->rh; + h2->h = fuzzy_fnv_hash (c, h2->h); + + if (h1->rh % h1->block_size == (h1->block_size - 1)) { + h1->hash_pipe[h1->hi] = b64[h1->h % 64]; + if (h1->hi < FUZZY_HASHLEN - 2) { + h1->h = HASH_INIT; + h1->hi++; + } + } + if (h2->rh % h2->block_size == (h2->block_size - 1)) { + h2->hash_pipe[h2->hi] = b64[h2->h % 64]; + if (h2->hi < FUZZY_HASHLEN - 2) { + h2->h = HASH_INIT; + h2->hi++; + } + } +} + /* * Levenshtein distance between string1 and string2. * @@ -284,6 +312,90 @@ fuzzy_init_byte_array (GByteArray * in, memory_pool_t * pool) return fuzzy_init (&f, pool); } +void +fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool) +{ + fuzzy_hash_t *new, *new2; + gint i; + gchar *c; + gsize real_len = 0, len = part->content->len; + GList *cur_offset; + struct uri *cur_url = NULL; + GString *debug; + + cur_offset = part->urls_offset; + if (cur_offset != NULL) { + cur_url = cur_offset->data; + } + + c = part->content->data; + new = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t)); + new2 = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t)); + bzero (&rs, sizeof (rs)); + for (i = 0; i < len;) { + if (cur_url != NULL && cur_url->pos == i) { + i += cur_url->len + 1; + c += cur_url->len + 1; + cur_offset = g_list_next (cur_offset); + if (cur_offset != NULL) { + cur_url = cur_offset->data; + } + } + else { + if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) { + real_len ++; + } + c++; + i++; + } + } + + debug = g_string_sized_new (real_len); + + new->block_size = fuzzy_blocksize (real_len); + new2->block_size = new->block_size * 2; + + cur_offset = part->urls_offset; + if (cur_offset != NULL) { + cur_url = cur_offset->data; + } + + c = part->content->data; + + for (i = 0; i < len;) { + if (cur_url != NULL && cur_url->pos == i) { + i += cur_url->len + 1; + c += cur_url->len + 1; + cur_offset = g_list_next (cur_offset); + if (cur_offset != NULL) { + cur_url = cur_offset->data; + } + msg_info ("skip url block of %d symbols", cur_url->len); + } + else { + if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) { + fuzzy_update2 (new, new2, *c); + g_string_append_c (debug, *c); + } + c++; + i++; + } + } + + msg_info ("make hash of string: %v", debug); + + /* Check whether we have more bytes in a rolling window */ + if (new->rh != 0) { + new->hash_pipe[new->hi] = b64[new->h % 64]; + } + if (new2->rh != 0) { + new2->hash_pipe[new2->hi] = b64[new2->h % 64]; + } + + part->fuzzy = new; + part->double_fuzzy = new2; +} + /* Compare score of difference between two hashes 0 - different hashes, 100 - identical hashes */ gint fuzzy_compare_hashes (fuzzy_hash_t * h1, fuzzy_hash_t * h2) @@ -308,6 +420,22 @@ fuzzy_compare_hashes (fuzzy_hash_t * h1, fuzzy_hash_t * h2) return res; } +gint +fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2) +{ + if (p1->fuzzy->block_size == p2->fuzzy->block_size) { + return fuzzy_compare_hashes (p1->fuzzy, p2->fuzzy); + } + else if (p1->double_fuzzy->block_size == p2->fuzzy->block_size) { + return fuzzy_compare_hashes (p1->double_fuzzy, p2->fuzzy); + } + else if (p2->double_fuzzy->block_size == p1->fuzzy->block_size) { + return fuzzy_compare_hashes (p2->double_fuzzy, p1->fuzzy); + } + + return 0; +} + /* * vi:ts=4 */ diff --git a/src/fuzzy.h b/src/fuzzy.h index b5b3856e6..271bfee2a 100644 --- a/src/fuzzy.h +++ b/src/fuzzy.h @@ -20,6 +20,8 @@ typedef struct fuzzy_hash_s { guint32 hi; /**< current index in hash pipe */ } fuzzy_hash_t; +struct mime_text_part; + /** * Calculate fuzzy hash for specified string * @param in input string @@ -28,6 +30,9 @@ typedef struct fuzzy_hash_s { */ fuzzy_hash_t * fuzzy_init (f_str_t *in, memory_pool_t *pool); fuzzy_hash_t * fuzzy_init_byte_array (GByteArray *in, memory_pool_t *pool); +void fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool); + +gint fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2); /** * Compare score of difference between two hashes diff --git a/src/html.c b/src/html.c index e686570a0..3582022f8 100644 --- a/src/html.c +++ b/src/html.c @@ -687,7 +687,7 @@ check_phishing (struct worker_task *task, struct uri *href_url, const gchar *url gchar tagbuf[128]; struct html_tag *tag; gsize len = 0; - gint off, rc; + gint rc; p = url_text; while (len < remain) { @@ -719,7 +719,7 @@ check_phishing (struct worker_task *task, struct uri *href_url, const gchar *url p ++; } - if (url_try_text (task->task_pool, url_text, len, &off, &url_str) && url_str != NULL) { + if (url_try_text (task->task_pool, url_text, len, NULL, NULL, &url_str) && url_str != NULL) { new = memory_pool_alloc0 (task->task_pool, sizeof (struct uri)); if (new != NULL) { g_strstrip (url_str); @@ -864,13 +864,10 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i /* * Check for phishing */ - if ((p = strchr (c, '>')) != NULL ) { + if ((p = strchr (c, '>')) != NULL && id == Tag_A) { p ++; check_phishing (task, url, p, remain - (p - tag_text), id); } - if (part->html_urls && g_tree_lookup (part->html_urls, url_text) == NULL) { - g_tree_insert (part->html_urls, url_text, url); - } if (g_tree_lookup (task->urls, url) == NULL) { g_tree_insert (task->urls, url, url); } @@ -938,7 +935,8 @@ add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_ /* Skip some tags */ if (data->tag && (data->tag->id == Tag_STYLE || data->tag->id == Tag_SCRIPT || - data->tag->id == Tag_OBJECT)) { + data->tag->id == Tag_OBJECT || + data->tag->id == Tag_TITLE)) { return FALSE; } } diff --git a/src/message.c b/src/message.c index 8d36ad3eb..0586be8d7 100644 --- a/src/message.c +++ b/src/message.c @@ -784,9 +784,6 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont text_part->html_nodes = NULL; text_part->parent = parent; - text_part->html_urls = g_tree_new ((GCompareFunc) g_ascii_strcasecmp); - text_part->urls = g_tree_new ((GCompareFunc) g_ascii_strcasecmp); - text_part->content = strip_html_tags (task, task->task_pool, text_part, text_part->orig, NULL); if (text_part->html_nodes == NULL) { @@ -800,10 +797,8 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont #endif } - text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool); + fuzzy_init_part (text_part, task->task_pool); memory_pool_add_destructor (task->task_pool, (pool_destruct_func) free_byte_array_callback, text_part->content); - memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_tree_destroy, text_part->html_urls); - memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_tree_destroy, text_part->urls); task->text_parts = g_list_prepend (task->text_parts, text_part); } else if (g_mime_content_type_is_type (type, "text", "*")) { @@ -821,12 +816,9 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont } text_part->orig = convert_text_to_utf (task, part_content, type, text_part); text_part->content = text_part->orig; - text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool); - text_part->html_urls = NULL; - text_part->urls = g_tree_new ((GCompareFunc) g_ascii_strcasecmp); url_parse_text (task->task_pool, task, text_part, FALSE); + fuzzy_init_part (text_part, task->task_pool); task->text_parts = g_list_prepend (task->text_parts, text_part); - memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_tree_destroy, text_part->urls); } } @@ -973,10 +965,10 @@ process_message (struct worker_task *task) GMimePart *part; GMimeDataWrapper *wrapper; struct received_header *recv; - gchar *mid, *url_str, *p, *end; + gchar *mid, *url_str, *p, *end, *url_end; struct uri *subject_url; gsize len; - gint pos, rc; + gint rc; tmp = memory_pool_alloc (task->task_pool, sizeof (GByteArray)); tmp->data = task->msg->begin; @@ -1127,7 +1119,7 @@ process_message (struct worker_task *task) while (p < end) { /* Search to the end of url */ - if (url_try_text (task->task_pool, p, end - p, &pos, &url_str)) { + if (url_try_text (task->task_pool, p, end - p, NULL, &url_end, &url_str)) { if (url_str != NULL) { subject_url = memory_pool_alloc0 (task->task_pool, sizeof (struct uri)); if (subject_url != NULL) { @@ -1150,7 +1142,7 @@ process_message (struct worker_task *task) else { break; } - p += pos; + p = url_end + 1; } /* Free header's list */ g_list_free (cur); diff --git a/src/message.h b/src/message.h index e70dd07e2..5f19ab892 100644 --- a/src/message.h +++ b/src/message.h @@ -30,9 +30,9 @@ struct mime_text_part { GByteArray *orig; GByteArray *content; GNode *html_nodes; - GTree *urls; - GTree *html_urls; + GList *urls_offset; /**< list of offsets of urls */ fuzzy_hash_t *fuzzy; + fuzzy_hash_t *double_fuzzy; GMimeObject *parent; }; diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c index 3f068ed99..93ba4bf25 100644 --- a/src/plugins/fuzzy_check.c +++ b/src/plugins/fuzzy_check.c @@ -686,6 +686,7 @@ fuzzy_symbol_callback (struct worker_task *task, void *unused) } register_fuzzy_call (task, part->fuzzy); + register_fuzzy_call (task, part->double_fuzzy); cur = g_list_next (cur); } @@ -843,6 +844,16 @@ fuzzy_process_handler (struct controller_session *session, f_str_t * in) free_task (task, FALSE); return; } + if (! register_fuzzy_controller_call (session, task, part->double_fuzzy, cmd, value, flag, saved)) { + /* Cannot write hash */ + session->state = STATE_REPLY; + r = rspamd_snprintf (out_buf, sizeof (out_buf), "cannot write fuzzy hash" CRLF "END" CRLF); + if (! rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE)) { + return; + } + free_task (task, FALSE); + return; + } cur = g_list_next (cur); } /* Process images */ diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c index 441a17de5..a82c60110 100644 --- a/src/plugins/regexp.c +++ b/src/plugins/regexp.c @@ -845,31 +845,16 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task, const gchar /*XXX: add support of it */ msg_warn ("numbered matches are not supported for url regexp"); } - cur = g_list_first (task->text_parts); - while (cur) { - part = (struct mime_text_part *)cur->data; - /* Skip empty parts */ - if (part->is_empty) { - cur = g_list_next (cur); - continue; - } - if (part->is_raw) { - regexp = re->raw_regexp; - } - else { - regexp = re->regexp; - } - callback_param.task = task; - callback_param.regexp = regexp; - callback_param.re = re; - callback_param.found = FALSE; - if (part->urls) { - g_tree_foreach (part->urls, tree_url_callback, &callback_param); - } - if (part->html_urls && callback_param.found == FALSE) { - g_tree_foreach (part->html_urls, tree_url_callback, &callback_param); - } - cur = g_list_next (cur); + regexp = re->regexp; + callback_param.task = task; + callback_param.regexp = regexp; + callback_param.re = re; + callback_param.found = FALSE; + if (task->urls) { + g_tree_foreach (task->urls, tree_url_callback, &callback_param); + } + if (task->emails && callback_param.found == FALSE) { + g_tree_foreach (task->emails, tree_url_callback, &callback_param); } if (callback_param.found == FALSE) { task_cache_add (task, re, 0); diff --git a/src/printf.c b/src/printf.c index a8bf0fdb1..4155f9478 100644 --- a/src/printf.c +++ b/src/printf.c @@ -216,6 +216,7 @@ rspamd_vsnprintf (gchar *buf, glong max, const gchar *fmt, va_list args) guint64 ui64; guint width, sign, hex, max_width, frac_width, i; f_str_t *v; + GString *gs; if (max <= 0) { return buf; @@ -312,6 +313,15 @@ rspamd_vsnprintf (gchar *buf, glong max, const gchar *fmt, va_list args) continue; + case 'v': + gs = va_arg (args, GString *); + len = gs->len; + len = (buf + len < last) ? len : (size_t) (last - buf); + + buf = ((gchar *)memcpy (buf, gs->str, len)) + len; + fmt++; + break; + case 's': p = va_arg(args, gchar *); if (p == NULL) { diff --git a/src/printf.h b/src/printf.h index 0d41bdc80..c1c6866d6 100644 --- a/src/printf.h +++ b/src/printf.h @@ -44,6 +44,7 @@ * %r rlim_t * %p void * * %V f_str_t * + * %v GString * * %s null-terminated string * %S ascii null-terminated string * %*s length and string diff --git a/src/url.c b/src/url.c index 83492eaab..dbc04ffab 100644 --- a/src/url.c +++ b/src/url.c @@ -1157,10 +1157,10 @@ url_email_end (const gchar *begin, const gchar *end, const gchar *pos, url_match void url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html) { - gint rc, off = 0; - gchar *url_str = NULL; + gint rc; + gchar *url_str = NULL, *url_start, *url_end; struct uri *new; - const guint8 *p, *end; + gchar *p, *end, *begin; if (!part->orig->data || part->orig->len == 0) { @@ -1170,34 +1170,37 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text if (url_init () == 0) { if (is_html) { - p = part->orig->data; - end = p + part->orig->len; + begin = part->orig->data; + end = begin + part->orig->len; + p = begin; } else { - p = part->content->data; - end = p + part->content->len; + begin = part->content->data; + end = begin + part->content->len; + p = begin; } while (p < end) { - if (url_try_text (pool, p, end - p, &off, &url_str)) { - if (url_str != NULL && - g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) { + if (url_try_text (pool, p, end - p, &url_start, &url_end, &url_str)) { + if (url_str != NULL) { new = memory_pool_alloc0 (pool, sizeof (struct uri)); if (new != NULL) { g_strstrip (url_str); rc = parse_uri (new, url_str, pool); if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) && new->hostlen > 0) { + new->pos = url_start - begin; + new->len = url_end - url_start; if (new->protocol == PROTOCOL_MAILTO) { if (!g_tree_lookup (task->emails, new)) { g_tree_insert (task->emails, new, new); } } else { - g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new); if (!g_tree_lookup (task->urls, new)) { g_tree_insert (task->urls, new, new); } } + part->urls_offset = g_list_prepend (part->urls_offset, new); } else if (rc != URI_ERRNO_OK) { msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc)); @@ -1208,13 +1211,18 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text else { break; } - p += off; + p = url_end + 1; } } + /* Handle offsets of this part */ + if (part->urls_offset != NULL) { + part->urls_offset = g_list_reverse (part->urls_offset); + memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_list_free, part->urls_offset); + } } gboolean -url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gint *res, gchar **url_str) +url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gchar **start, gchar **fin, gchar **url_str) { const gchar *end, *pos; gint idx, l; @@ -1247,8 +1255,11 @@ url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gint *res, gch else { *url_str = NULL; } - if (res) { - *res = (pos - begin) + strlen (matcher->pattern); + if (start != NULL) { + *start = (gchar *)pos; + } + if (fin != NULL) { + *fin = (gchar *)pos + m.m_len; } return TRUE; } diff --git a/src/url.h b/src/url.h index eb11ceba3..9c0812e62 100644 --- a/src/url.h +++ b/src/url.h @@ -32,6 +32,9 @@ struct uri { struct uri *phished_url; + gsize pos; + gsize len; + /* @protocollen should only be usable if @protocol is either * PROTOCOL_USER or an uri string should be composed. */ guint protocollen; @@ -76,7 +79,7 @@ enum protocol { void url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html); enum uri_errno parse_uri(struct uri *uri, gchar *uristring, memory_pool_t *pool); -gboolean url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gint *res, gchar **url_str); +gboolean url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gchar **start, gchar **end, gchar **url_str); const gchar* url_strerror (enum uri_errno err); #endif