about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- src/expressions.c 2
-rw-r--r-- src/fuzzy.c 136
-rw-r--r-- src/fuzzy.h 5
-rw-r--r-- src/html.c 12
-rw-r--r-- src/message.c 20
-rw-r--r-- src/message.h 4
-rw-r--r-- src/plugins/fuzzy_check.c 11
-rw-r--r-- src/plugins/regexp.c 35
-rw-r--r-- src/printf.c 10
-rw-r--r-- src/printf.h 1
-rw-r--r-- src/url.c 41
-rw-r--r-- src/url.h 5
12 files changed, 213 insertions, 69 deletions
diff --git a/src/expressions.c b/src/expressions.c
index e590ad630..fa6ce0fef 100644
--- a/src/expressions.c
+++ b/src/expressions.c
@@ -1059,7 +1059,7 @@ rspamd_parts_distance (struct worker_task * task, GList * args, void *unused)
return FALSE;
}
if (!p1->is_empty && !p2->is_empty) {
- diff = fuzzy_compare_hashes (p1->fuzzy, p2->fuzzy);
+ diff = fuzzy_compare_parts (p1, p2);
debug_task ("got likeliness between parts of %d%%, threshold is %d%%", diff, threshold);
if (diff <= threshold) {
return TRUE;
diff --git a/src/fuzzy.c b/src/fuzzy.c
index 61ef5647e..ce5217b5b 100644
--- a/src/fuzzy.c
+++ b/src/fuzzy.c
@@ -27,6 +27,9 @@
#include "mem_pool.h"
#include "fstring.h"
#include "fuzzy.h"
+#include "message.h"
+#include "url.h"
+#include "main.h"
#define ROLL_WINDOW_SIZE 9
#define MIN_FUZZY_BLOCK_SIZE 3
@@ -81,16 +84,17 @@ fuzzy_fnv_hash (gchar c, guint32 hval)
static guint32
fuzzy_blocksize (guint32 len)
{
+ guint32 nlen = MIN_FUZZY_BLOCK_SIZE;
- if (len < MIN_FUZZY_BLOCK_SIZE) {
- return MIN_FUZZY_BLOCK_SIZE;
+ while (nlen * (FUZZY_HASHLEN - 1) < len) {
+ nlen *= 2;
}
- return g_spaced_primes_closest (len / FUZZY_HASHLEN);
+ return nlen;
}
/* Update hash with new symbol */
-void
+static void
fuzzy_update (fuzzy_hash_t * h, gchar c)
{
h->rh = fuzzy_roll_hash (c);
@@ -105,6 +109,30 @@ fuzzy_update (fuzzy_hash_t * h, gchar c)
}
}
+/*
+ * Close the current block of a hash when its rolling hash value lands on
+ * a block boundary: store the base64 symbol for the accumulated block hash
+ * and, while there is still room in the pipe, restart the block hash and
+ * move on to the next pipe slot.
+ */
+static void
+fuzzy_close_block_on_boundary (fuzzy_hash_t *h)
+{
+	if (h->rh % h->block_size != (h->block_size - 1)) {
+		return;
+	}
+
+	h->hash_pipe[h->hi] = b64[h->h % 64];
+	if (h->hi < FUZZY_HASHLEN - 2) {
+		h->h = HASH_INIT;
+		h->hi++;
+	}
+}
+
+/*
+ * Feed one symbol into two hashes at once.
+ * The rolling hash is computed a single time and shared by both hashes;
+ * each hash keeps its own block hash and checks its own block size.
+ */
+static void
+fuzzy_update2 (fuzzy_hash_t * h1, fuzzy_hash_t *h2, gchar c)
+{
+	/* Rolling hash is evaluated once per symbol and copied to the second hash */
+	h1->rh = fuzzy_roll_hash (c);
+	h1->h = fuzzy_fnv_hash (c, h1->h);
+	h2->rh = h1->rh;
+	h2->h = fuzzy_fnv_hash (c, h2->h);
+
+	fuzzy_close_block_on_boundary (h1);
+	fuzzy_close_block_on_boundary (h2);
+}
+
/*
* Levenshtein distance between string1 and string2.
*
@@ -284,6 +312,90 @@ fuzzy_init_byte_array (GByteArray * in, memory_pool_t * pool)
return fuzzy_init (&f, pool);
}
+/*
+ * Walk the content of a text part, stepping over the byte ranges listed in
+ * part->urls_offset (each range is skipped as uri->len + 1 bytes starting at
+ * uri->pos) and ignoring ASCII whitespace and punctuation.
+ *
+ * When h1 is NULL the function only counts the remaining symbols.
+ * Otherwise every remaining symbol is fed to both hashes via fuzzy_update2
+ * and appended to out, and each skipped url range is logged.
+ *
+ * Returns the number of symbols that were neither skipped nor ignored.
+ */
+static gsize
+fuzzy_walk_part (struct mime_text_part *part, fuzzy_hash_t *h1, fuzzy_hash_t *h2, GString *out)
+{
+	const gchar *data = (const gchar *)part->content->data;
+	gsize i = 0, significant = 0, len = part->content->len;
+	GList *cur_offset = part->urls_offset;
+	struct uri *cur_url = NULL;
+
+	if (cur_offset != NULL) {
+		cur_url = cur_offset->data;
+	}
+
+	while (i < len) {
+		if (cur_url != NULL && cur_url->pos == i) {
+			if (h1 != NULL) {
+				/* Log the url that is being skipped, before moving to the next one */
+				msg_info ("skip url block of %d symbols", (gint)cur_url->len);
+			}
+			i += cur_url->len + 1;
+			cur_offset = g_list_next (cur_offset);
+			/* Drop the cursor once the list is exhausted instead of keeping a stale url */
+			cur_url = (cur_offset != NULL) ? cur_offset->data : NULL;
+		}
+		else {
+			if (!g_ascii_isspace (data[i]) && !g_ascii_ispunct (data[i])) {
+				significant++;
+				if (h1 != NULL) {
+					fuzzy_update2 (h1, h2, data[i]);
+					g_string_append_c (out, data[i]);
+				}
+			}
+			i++;
+		}
+	}
+
+	return significant;
+}
+
+/**
+ * Calculate two fuzzy hashes for a text part and store them in
+ * part->fuzzy and part->double_fuzzy.
+ * The hashes are built only from symbols that are not inside url ranges
+ * and are neither ASCII whitespace nor punctuation; the second hash uses
+ * a block size twice as large as the first one.
+ * @param part text part to process, its content and urls_offset are read
+ * @param pool memory pool used to allocate both hashes
+ */
+void
+fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
+{
+	fuzzy_hash_t *new, *new2;
+	gsize real_len;
+	GString *debug;
+
+	new = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t));
+	new2 = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t));
+	/* NOTE(review): rs is defined elsewhere in this file, presumably the rolling hash state - confirm */
+	bzero (&rs, sizeof (rs));
+
+	/* First pass: count hashable symbols to choose the block size */
+	real_len = fuzzy_walk_part (part, NULL, NULL, NULL);
+
+	new->block_size = fuzzy_blocksize (real_len);
+	new2->block_size = new->block_size * 2;
+
+	/* Second pass: feed the same symbols to both hashes */
+	debug = g_string_sized_new (real_len);
+	fuzzy_walk_part (part, new, new2, debug);
+
+	msg_info ("make hash of string: %v", debug);
+	/* The debug string is only needed for the log line above */
+	g_string_free (debug, TRUE);
+
+	/* Check whether we have more bytes in a rolling window */
+	if (new->rh != 0) {
+		new->hash_pipe[new->hi] = b64[new->h % 64];
+	}
+	if (new2->rh != 0) {
+		new2->hash_pipe[new2->hi] = b64[new2->h % 64];
+	}
+
+	part->fuzzy = new;
+	part->double_fuzzy = new2;
+}
+
/* Compare score of difference between two hashes 0 - different hashes, 100 - identical hashes */
gint
fuzzy_compare_hashes (fuzzy_hash_t * h1, fuzzy_hash_t * h2)
@@ -308,6 +420,22 @@ fuzzy_compare_hashes (fuzzy_hash_t * h1, fuzzy_hash_t * h2)
return res;
}
+/*
+ * Compare two text parts by their fuzzy hashes.
+ * Hashes are compared only when their block sizes are equal: first the
+ * plain hashes of both parts, then the doubled hash of one part against
+ * the plain hash of the other. Returns the result of fuzzy_compare_hashes
+ * for the first matching pair, or 0 when no pair has equal block sizes.
+ */
+gint
+fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2)
+{
+	fuzzy_hash_t *lhs = NULL, *rhs = NULL;
+
+	if (p1->fuzzy->block_size == p2->fuzzy->block_size) {
+		lhs = p1->fuzzy;
+		rhs = p2->fuzzy;
+	}
+	else if (p1->double_fuzzy->block_size == p2->fuzzy->block_size) {
+		lhs = p1->double_fuzzy;
+		rhs = p2->fuzzy;
+	}
+	else if (p2->double_fuzzy->block_size == p1->fuzzy->block_size) {
+		lhs = p2->double_fuzzy;
+		rhs = p1->fuzzy;
+	}
+
+	/* No pair of hashes shares a block size, nothing to compare */
+	if (lhs == NULL) {
+		return 0;
+	}
+
+	return fuzzy_compare_hashes (lhs, rhs);
+}
+
/*
* vi:ts=4
*/
diff --git a/src/fuzzy.h b/src/fuzzy.h
index b5b3856e6..271bfee2a 100644
--- a/src/fuzzy.h
+++ b/src/fuzzy.h
@@ -20,6 +20,8 @@ typedef struct fuzzy_hash_s {
guint32 hi; /**< current index in hash pipe */
} fuzzy_hash_t;
+struct mime_text_part;
+
/**
* Calculate fuzzy hash for specified string
* @param in input string
@@ -28,6 +30,9 @@ typedef struct fuzzy_hash_s {
*/
fuzzy_hash_t * fuzzy_init (f_str_t *in, memory_pool_t *pool);
fuzzy_hash_t * fuzzy_init_byte_array (GByteArray *in, memory_pool_t *pool);
+void fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool);
+
+gint fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2);
/**
* Compare score of difference between two hashes
diff --git a/src/html.c b/src/html.c
index e686570a0..3582022f8 100644
--- a/src/html.c
+++ b/src/html.c
@@ -687,7 +687,7 @@ check_phishing (struct worker_task *task, struct uri *href_url, const gchar *url
gchar tagbuf[128];
struct html_tag *tag;
gsize len = 0;
- gint off, rc;
+ gint rc;
p = url_text;
while (len < remain) {
@@ -719,7 +719,7 @@ check_phishing (struct worker_task *task, struct uri *href_url, const gchar *url
p ++;
}
- if (url_try_text (task->task_pool, url_text, len, &off, &url_str) && url_str != NULL) {
+ if (url_try_text (task->task_pool, url_text, len, NULL, NULL, &url_str) && url_str != NULL) {
new = memory_pool_alloc0 (task->task_pool, sizeof (struct uri));
if (new != NULL) {
g_strstrip (url_str);
@@ -864,13 +864,10 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i
/*
* Check for phishing
*/
- if ((p = strchr (c, '>')) != NULL ) {
+ if ((p = strchr (c, '>')) != NULL && id == Tag_A) {
p ++;
check_phishing (task, url, p, remain - (p - tag_text), id);
}
- if (part->html_urls && g_tree_lookup (part->html_urls, url_text) == NULL) {
- g_tree_insert (part->html_urls, url_text, url);
- }
if (g_tree_lookup (task->urls, url) == NULL) {
g_tree_insert (task->urls, url, url);
}
@@ -938,7 +935,8 @@ add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_
/* Skip some tags */
if (data->tag && (data->tag->id == Tag_STYLE ||
data->tag->id == Tag_SCRIPT ||
- data->tag->id == Tag_OBJECT)) {
+ data->tag->id == Tag_OBJECT ||
+ data->tag->id == Tag_TITLE)) {
return FALSE;
}
}
diff --git a/src/message.c b/src/message.c
index 8d36ad3eb..0586be8d7 100644
--- a/src/message.c
+++ b/src/message.c
@@ -784,9 +784,6 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
text_part->html_nodes = NULL;
text_part->parent = parent;
- text_part->html_urls = g_tree_new ((GCompareFunc) g_ascii_strcasecmp);
- text_part->urls = g_tree_new ((GCompareFunc) g_ascii_strcasecmp);
-
text_part->content = strip_html_tags (task, task->task_pool, text_part, text_part->orig, NULL);
if (text_part->html_nodes == NULL) {
@@ -800,10 +797,8 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
#endif
}
- text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
+ fuzzy_init_part (text_part, task->task_pool);
memory_pool_add_destructor (task->task_pool, (pool_destruct_func) free_byte_array_callback, text_part->content);
- memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_tree_destroy, text_part->html_urls);
- memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_tree_destroy, text_part->urls);
task->text_parts = g_list_prepend (task->text_parts, text_part);
}
else if (g_mime_content_type_is_type (type, "text", "*")) {
@@ -821,12 +816,9 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
}
text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
text_part->content = text_part->orig;
- text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
- text_part->html_urls = NULL;
- text_part->urls = g_tree_new ((GCompareFunc) g_ascii_strcasecmp);
url_parse_text (task->task_pool, task, text_part, FALSE);
+ fuzzy_init_part (text_part, task->task_pool);
task->text_parts = g_list_prepend (task->text_parts, text_part);
- memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_tree_destroy, text_part->urls);
}
}
@@ -973,10 +965,10 @@ process_message (struct worker_task *task)
GMimePart *part;
GMimeDataWrapper *wrapper;
struct received_header *recv;
- gchar *mid, *url_str, *p, *end;
+ gchar *mid, *url_str, *p, *end, *url_end;
struct uri *subject_url;
gsize len;
- gint pos, rc;
+ gint rc;
tmp = memory_pool_alloc (task->task_pool, sizeof (GByteArray));
tmp->data = task->msg->begin;
@@ -1127,7 +1119,7 @@ process_message (struct worker_task *task)
while (p < end) {
/* Search to the end of url */
- if (url_try_text (task->task_pool, p, end - p, &pos, &url_str)) {
+ if (url_try_text (task->task_pool, p, end - p, NULL, &url_end, &url_str)) {
if (url_str != NULL) {
subject_url = memory_pool_alloc0 (task->task_pool, sizeof (struct uri));
if (subject_url != NULL) {
@@ -1150,7 +1142,7 @@ process_message (struct worker_task *task)
else {
break;
}
- p += pos;
+ p = url_end + 1;
}
/* Free header's list */
g_list_free (cur);
diff --git a/src/message.h b/src/message.h
index e70dd07e2..5f19ab892 100644
--- a/src/message.h
+++ b/src/message.h
@@ -30,9 +30,9 @@ struct mime_text_part {
GByteArray *orig;
GByteArray *content;
GNode *html_nodes;
- GTree *urls;
- GTree *html_urls;
+ GList *urls_offset; /**< list of offsets of urls */
fuzzy_hash_t *fuzzy;
+ fuzzy_hash_t *double_fuzzy;
GMimeObject *parent;
};
diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c
index 3f068ed99..93ba4bf25 100644
--- a/src/plugins/fuzzy_check.c
+++ b/src/plugins/fuzzy_check.c
@@ -686,6 +686,7 @@ fuzzy_symbol_callback (struct worker_task *task, void *unused)
}
register_fuzzy_call (task, part->fuzzy);
+ register_fuzzy_call (task, part->double_fuzzy);
cur = g_list_next (cur);
}
@@ -843,6 +844,16 @@ fuzzy_process_handler (struct controller_session *session, f_str_t * in)
free_task (task, FALSE);
return;
}
+ if (! register_fuzzy_controller_call (session, task, part->double_fuzzy, cmd, value, flag, saved)) {
+ /* Cannot write hash */
+ session->state = STATE_REPLY;
+ r = rspamd_snprintf (out_buf, sizeof (out_buf), "cannot write fuzzy hash" CRLF "END" CRLF);
+ if (! rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE)) {
+ return;
+ }
+ free_task (task, FALSE);
+ return;
+ }
cur = g_list_next (cur);
}
/* Process images */
diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c
index 441a17de5..a82c60110 100644
--- a/src/plugins/regexp.c
+++ b/src/plugins/regexp.c
@@ -845,31 +845,16 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task, const gchar
/*XXX: add support of it */
msg_warn ("numbered matches are not supported for url regexp");
}
- cur = g_list_first (task->text_parts);
- while (cur) {
- part = (struct mime_text_part *)cur->data;
- /* Skip empty parts */
- if (part->is_empty) {
- cur = g_list_next (cur);
- continue;
- }
- if (part->is_raw) {
- regexp = re->raw_regexp;
- }
- else {
- regexp = re->regexp;
- }
- callback_param.task = task;
- callback_param.regexp = regexp;
- callback_param.re = re;
- callback_param.found = FALSE;
- if (part->urls) {
- g_tree_foreach (part->urls, tree_url_callback, &callback_param);
- }
- if (part->html_urls && callback_param.found == FALSE) {
- g_tree_foreach (part->html_urls, tree_url_callback, &callback_param);
- }
- cur = g_list_next (cur);
+ regexp = re->regexp;
+ callback_param.task = task;
+ callback_param.regexp = regexp;
+ callback_param.re = re;
+ callback_param.found = FALSE;
+ if (task->urls) {
+ g_tree_foreach (task->urls, tree_url_callback, &callback_param);
+ }
+ if (task->emails && callback_param.found == FALSE) {
+ g_tree_foreach (task->emails, tree_url_callback, &callback_param);
}
if (callback_param.found == FALSE) {
task_cache_add (task, re, 0);
diff --git a/src/printf.c b/src/printf.c
index a8bf0fdb1..4155f9478 100644
--- a/src/printf.c
+++ b/src/printf.c
@@ -216,6 +216,7 @@ rspamd_vsnprintf (gchar *buf, glong max, const gchar *fmt, va_list args)
guint64 ui64;
guint width, sign, hex, max_width, frac_width, i;
f_str_t *v;
+ GString *gs;
if (max <= 0) {
return buf;
@@ -312,6 +313,15 @@ rspamd_vsnprintf (gchar *buf, glong max, const gchar *fmt, va_list args)
continue;
+ case 'v':
+ gs = va_arg (args, GString *);
+ len = gs->len;
+ len = (buf + len < last) ? len : (size_t) (last - buf);
+
+ buf = ((gchar *)memcpy (buf, gs->str, len)) + len;
+ fmt++;
+ break;
+
case 's':
p = va_arg(args, gchar *);
if (p == NULL) {
diff --git a/src/printf.h b/src/printf.h
index 0d41bdc80..c1c6866d6 100644
--- a/src/printf.h
+++ b/src/printf.h
@@ -44,6 +44,7 @@
* %r rlim_t
* %p void *
* %V f_str_t *
+ * %v GString *
* %s null-terminated string
* %S ascii null-terminated string
* %*s length and string
diff --git a/src/url.c b/src/url.c
index 83492eaab..dbc04ffab 100644
--- a/src/url.c
+++ b/src/url.c
@@ -1157,10 +1157,10 @@ url_email_end (const gchar *begin, const gchar *end, const gchar *pos, url_match
void
url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html)
{
- gint rc, off = 0;
- gchar *url_str = NULL;
+ gint rc;
+ gchar *url_str = NULL, *url_start, *url_end;
struct uri *new;
- const guint8 *p, *end;
+ gchar *p, *end, *begin;
if (!part->orig->data || part->orig->len == 0) {
@@ -1170,34 +1170,37 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text
if (url_init () == 0) {
if (is_html) {
- p = part->orig->data;
- end = p + part->orig->len;
+ begin = part->orig->data;
+ end = begin + part->orig->len;
+ p = begin;
}
else {
- p = part->content->data;
- end = p + part->content->len;
+ begin = part->content->data;
+ end = begin + part->content->len;
+ p = begin;
}
while (p < end) {
- if (url_try_text (pool, p, end - p, &off, &url_str)) {
- if (url_str != NULL &&
- g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) {
+ if (url_try_text (pool, p, end - p, &url_start, &url_end, &url_str)) {
+ if (url_str != NULL) {
new = memory_pool_alloc0 (pool, sizeof (struct uri));
if (new != NULL) {
g_strstrip (url_str);
rc = parse_uri (new, url_str, pool);
if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) &&
new->hostlen > 0) {
+ new->pos = url_start - begin;
+ new->len = url_end - url_start;
if (new->protocol == PROTOCOL_MAILTO) {
if (!g_tree_lookup (task->emails, new)) {
g_tree_insert (task->emails, new, new);
}
}
else {
- g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
if (!g_tree_lookup (task->urls, new)) {
g_tree_insert (task->urls, new, new);
}
}
+ part->urls_offset = g_list_prepend (part->urls_offset, new);
}
else if (rc != URI_ERRNO_OK) {
msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc));
@@ -1208,13 +1211,18 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text
else {
break;
}
- p += off;
+ p = url_end + 1;
}
}
+ /* Handle offsets of this part */
+ if (part->urls_offset != NULL) {
+ part->urls_offset = g_list_reverse (part->urls_offset);
+ memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_list_free, part->urls_offset);
+ }
}
gboolean
-url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gint *res, gchar **url_str)
+url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gchar **start, gchar **fin, gchar **url_str)
{
const gchar *end, *pos;
gint idx, l;
@@ -1247,8 +1255,11 @@ url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gint *res, gch
else {
*url_str = NULL;
}
- if (res) {
- *res = (pos - begin) + strlen (matcher->pattern);
+ if (start != NULL) {
+ *start = (gchar *)pos;
+ }
+ if (fin != NULL) {
+ *fin = (gchar *)pos + m.m_len;
}
return TRUE;
}
diff --git a/src/url.h b/src/url.h
index eb11ceba3..9c0812e62 100644
--- a/src/url.h
+++ b/src/url.h
@@ -32,6 +32,9 @@ struct uri {
struct uri *phished_url;
+ gsize pos;
+ gsize len;
+
/* @protocollen should only be usable if @protocol is either
* PROTOCOL_USER or an uri string should be composed. */
guint protocollen;
@@ -76,7 +79,7 @@ enum protocol {
void url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html);
enum uri_errno parse_uri(struct uri *uri, gchar *uristring, memory_pool_t *pool);
-gboolean url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gint *res, gchar **url_str);
+gboolean url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gchar **start, gchar **end, gchar **url_str);
const gchar* url_strerror (enum uri_errno err);
#endif