Fix tags stripping. Fix phishing checks (ignore img tags).tags/0.4.0
@@ -1059,7 +1059,7 @@ rspamd_parts_distance (struct worker_task * task, GList * args, void *unused) | |||
return FALSE; | |||
} | |||
if (!p1->is_empty && !p2->is_empty) { | |||
diff = fuzzy_compare_hashes (p1->fuzzy, p2->fuzzy); | |||
diff = fuzzy_compare_parts (p1, p2); | |||
debug_task ("got likeliness between parts of %d%%, threshold is %d%%", diff, threshold); | |||
if (diff <= threshold) { | |||
return TRUE; |
@@ -27,6 +27,9 @@ | |||
#include "mem_pool.h" | |||
#include "fstring.h" | |||
#include "fuzzy.h" | |||
#include "message.h" | |||
#include "url.h" | |||
#include "main.h" | |||
#define ROLL_WINDOW_SIZE 9 | |||
#define MIN_FUZZY_BLOCK_SIZE 3 | |||
@@ -81,16 +84,17 @@ fuzzy_fnv_hash (gchar c, guint32 hval) | |||
static guint32 | |||
fuzzy_blocksize (guint32 len) | |||
{ | |||
guint32 nlen = MIN_FUZZY_BLOCK_SIZE; | |||
if (len < MIN_FUZZY_BLOCK_SIZE) { | |||
return MIN_FUZZY_BLOCK_SIZE; | |||
while (nlen * (FUZZY_HASHLEN - 1) < len) { | |||
nlen *= 2; | |||
} | |||
return g_spaced_primes_closest (len / FUZZY_HASHLEN); | |||
return nlen; | |||
} | |||
/* Update hash with new symbol */ | |||
void | |||
static void | |||
fuzzy_update (fuzzy_hash_t * h, gchar c) | |||
{ | |||
h->rh = fuzzy_roll_hash (c); | |||
@@ -105,6 +109,30 @@ fuzzy_update (fuzzy_hash_t * h, gchar c) | |||
} | |||
} | |||
static void | |||
fuzzy_update2 (fuzzy_hash_t * h1, fuzzy_hash_t *h2, gchar c) | |||
{ | |||
h1->rh = fuzzy_roll_hash (c); | |||
h1->h = fuzzy_fnv_hash (c, h1->h); | |||
h2->rh = h1->rh; | |||
h2->h = fuzzy_fnv_hash (c, h2->h); | |||
if (h1->rh % h1->block_size == (h1->block_size - 1)) { | |||
h1->hash_pipe[h1->hi] = b64[h1->h % 64]; | |||
if (h1->hi < FUZZY_HASHLEN - 2) { | |||
h1->h = HASH_INIT; | |||
h1->hi++; | |||
} | |||
} | |||
if (h2->rh % h2->block_size == (h2->block_size - 1)) { | |||
h2->hash_pipe[h2->hi] = b64[h2->h % 64]; | |||
if (h2->hi < FUZZY_HASHLEN - 2) { | |||
h2->h = HASH_INIT; | |||
h2->hi++; | |||
} | |||
} | |||
} | |||
/* | |||
* Levenshtein distance between string1 and string2. | |||
* | |||
@@ -284,6 +312,90 @@ fuzzy_init_byte_array (GByteArray * in, memory_pool_t * pool) | |||
return fuzzy_init (&f, pool); | |||
} | |||
void | |||
fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool) | |||
{ | |||
fuzzy_hash_t *new, *new2; | |||
gint i; | |||
gchar *c; | |||
gsize real_len = 0, len = part->content->len; | |||
GList *cur_offset; | |||
struct uri *cur_url = NULL; | |||
GString *debug; | |||
cur_offset = part->urls_offset; | |||
if (cur_offset != NULL) { | |||
cur_url = cur_offset->data; | |||
} | |||
c = part->content->data; | |||
new = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t)); | |||
new2 = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t)); | |||
bzero (&rs, sizeof (rs)); | |||
for (i = 0; i < len;) { | |||
if (cur_url != NULL && cur_url->pos == i) { | |||
i += cur_url->len + 1; | |||
c += cur_url->len + 1; | |||
cur_offset = g_list_next (cur_offset); | |||
if (cur_offset != NULL) { | |||
cur_url = cur_offset->data; | |||
} | |||
} | |||
else { | |||
if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) { | |||
real_len ++; | |||
} | |||
c++; | |||
i++; | |||
} | |||
} | |||
debug = g_string_sized_new (real_len); | |||
new->block_size = fuzzy_blocksize (real_len); | |||
new2->block_size = new->block_size * 2; | |||
cur_offset = part->urls_offset; | |||
if (cur_offset != NULL) { | |||
cur_url = cur_offset->data; | |||
} | |||
c = part->content->data; | |||
for (i = 0; i < len;) { | |||
if (cur_url != NULL && cur_url->pos == i) { | |||
i += cur_url->len + 1; | |||
c += cur_url->len + 1; | |||
cur_offset = g_list_next (cur_offset); | |||
if (cur_offset != NULL) { | |||
cur_url = cur_offset->data; | |||
} | |||
msg_info ("skip url block of %d symbols", cur_url->len); | |||
} | |||
else { | |||
if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) { | |||
fuzzy_update2 (new, new2, *c); | |||
g_string_append_c (debug, *c); | |||
} | |||
c++; | |||
i++; | |||
} | |||
} | |||
msg_info ("make hash of string: %v", debug); | |||
/* Check whether we have more bytes in a rolling window */ | |||
if (new->rh != 0) { | |||
new->hash_pipe[new->hi] = b64[new->h % 64]; | |||
} | |||
if (new2->rh != 0) { | |||
new2->hash_pipe[new2->hi] = b64[new2->h % 64]; | |||
} | |||
part->fuzzy = new; | |||
part->double_fuzzy = new2; | |||
} | |||
/* Compare score of difference between two hashes 0 - different hashes, 100 - identical hashes */ | |||
gint | |||
fuzzy_compare_hashes (fuzzy_hash_t * h1, fuzzy_hash_t * h2) | |||
@@ -308,6 +420,22 @@ fuzzy_compare_hashes (fuzzy_hash_t * h1, fuzzy_hash_t * h2) | |||
return res; | |||
} | |||
gint | |||
fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2) | |||
{ | |||
if (p1->fuzzy->block_size == p2->fuzzy->block_size) { | |||
return fuzzy_compare_hashes (p1->fuzzy, p2->fuzzy); | |||
} | |||
else if (p1->double_fuzzy->block_size == p2->fuzzy->block_size) { | |||
return fuzzy_compare_hashes (p1->double_fuzzy, p2->fuzzy); | |||
} | |||
else if (p2->double_fuzzy->block_size == p1->fuzzy->block_size) { | |||
return fuzzy_compare_hashes (p2->double_fuzzy, p1->fuzzy); | |||
} | |||
return 0; | |||
} | |||
/* | |||
* vi:ts=4 | |||
*/ |
@@ -20,6 +20,8 @@ typedef struct fuzzy_hash_s { | |||
guint32 hi; /**< current index in hash pipe */ | |||
} fuzzy_hash_t; | |||
struct mime_text_part; | |||
/** | |||
* Calculate fuzzy hash for specified string | |||
* @param in input string | |||
@@ -28,6 +30,9 @@ typedef struct fuzzy_hash_s { | |||
*/ | |||
fuzzy_hash_t * fuzzy_init (f_str_t *in, memory_pool_t *pool); | |||
fuzzy_hash_t * fuzzy_init_byte_array (GByteArray *in, memory_pool_t *pool); | |||
void fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool); | |||
gint fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2); | |||
/** | |||
* Compare score of difference between two hashes |
@@ -687,7 +687,7 @@ check_phishing (struct worker_task *task, struct uri *href_url, const gchar *url | |||
gchar tagbuf[128]; | |||
struct html_tag *tag; | |||
gsize len = 0; | |||
gint off, rc; | |||
gint rc; | |||
p = url_text; | |||
while (len < remain) { | |||
@@ -719,7 +719,7 @@ check_phishing (struct worker_task *task, struct uri *href_url, const gchar *url | |||
p ++; | |||
} | |||
if (url_try_text (task->task_pool, url_text, len, &off, &url_str) && url_str != NULL) { | |||
if (url_try_text (task->task_pool, url_text, len, NULL, NULL, &url_str) && url_str != NULL) { | |||
new = memory_pool_alloc0 (task->task_pool, sizeof (struct uri)); | |||
if (new != NULL) { | |||
g_strstrip (url_str); | |||
@@ -864,13 +864,10 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i | |||
/* | |||
* Check for phishing | |||
*/ | |||
if ((p = strchr (c, '>')) != NULL ) { | |||
if ((p = strchr (c, '>')) != NULL && id == Tag_A) { | |||
p ++; | |||
check_phishing (task, url, p, remain - (p - tag_text), id); | |||
} | |||
if (part->html_urls && g_tree_lookup (part->html_urls, url_text) == NULL) { | |||
g_tree_insert (part->html_urls, url_text, url); | |||
} | |||
if (g_tree_lookup (task->urls, url) == NULL) { | |||
g_tree_insert (task->urls, url, url); | |||
} | |||
@@ -938,7 +935,8 @@ add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_ | |||
/* Skip some tags */ | |||
if (data->tag && (data->tag->id == Tag_STYLE || | |||
data->tag->id == Tag_SCRIPT || | |||
data->tag->id == Tag_OBJECT)) { | |||
data->tag->id == Tag_OBJECT || | |||
data->tag->id == Tag_TITLE)) { | |||
return FALSE; | |||
} | |||
} |
@@ -784,9 +784,6 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont | |||
text_part->html_nodes = NULL; | |||
text_part->parent = parent; | |||
text_part->html_urls = g_tree_new ((GCompareFunc) g_ascii_strcasecmp); | |||
text_part->urls = g_tree_new ((GCompareFunc) g_ascii_strcasecmp); | |||
text_part->content = strip_html_tags (task, task->task_pool, text_part, text_part->orig, NULL); | |||
if (text_part->html_nodes == NULL) { | |||
@@ -800,10 +797,8 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont | |||
#endif | |||
} | |||
text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool); | |||
fuzzy_init_part (text_part, task->task_pool); | |||
memory_pool_add_destructor (task->task_pool, (pool_destruct_func) free_byte_array_callback, text_part->content); | |||
memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_tree_destroy, text_part->html_urls); | |||
memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_tree_destroy, text_part->urls); | |||
task->text_parts = g_list_prepend (task->text_parts, text_part); | |||
} | |||
else if (g_mime_content_type_is_type (type, "text", "*")) { | |||
@@ -821,12 +816,9 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont | |||
} | |||
text_part->orig = convert_text_to_utf (task, part_content, type, text_part); | |||
text_part->content = text_part->orig; | |||
text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool); | |||
text_part->html_urls = NULL; | |||
text_part->urls = g_tree_new ((GCompareFunc) g_ascii_strcasecmp); | |||
url_parse_text (task->task_pool, task, text_part, FALSE); | |||
fuzzy_init_part (text_part, task->task_pool); | |||
task->text_parts = g_list_prepend (task->text_parts, text_part); | |||
memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_tree_destroy, text_part->urls); | |||
} | |||
} | |||
@@ -973,10 +965,10 @@ process_message (struct worker_task *task) | |||
GMimePart *part; | |||
GMimeDataWrapper *wrapper; | |||
struct received_header *recv; | |||
gchar *mid, *url_str, *p, *end; | |||
gchar *mid, *url_str, *p, *end, *url_end; | |||
struct uri *subject_url; | |||
gsize len; | |||
gint pos, rc; | |||
gint rc; | |||
tmp = memory_pool_alloc (task->task_pool, sizeof (GByteArray)); | |||
tmp->data = task->msg->begin; | |||
@@ -1127,7 +1119,7 @@ process_message (struct worker_task *task) | |||
while (p < end) { | |||
/* Search to the end of url */ | |||
if (url_try_text (task->task_pool, p, end - p, &pos, &url_str)) { | |||
if (url_try_text (task->task_pool, p, end - p, NULL, &url_end, &url_str)) { | |||
if (url_str != NULL) { | |||
subject_url = memory_pool_alloc0 (task->task_pool, sizeof (struct uri)); | |||
if (subject_url != NULL) { | |||
@@ -1150,7 +1142,7 @@ process_message (struct worker_task *task) | |||
else { | |||
break; | |||
} | |||
p += pos; | |||
p = url_end + 1; | |||
} | |||
/* Free header's list */ | |||
g_list_free (cur); |
@@ -30,9 +30,9 @@ struct mime_text_part { | |||
GByteArray *orig; | |||
GByteArray *content; | |||
GNode *html_nodes; | |||
GTree *urls; | |||
GTree *html_urls; | |||
GList *urls_offset; /**< list of offsets of urls */ | |||
fuzzy_hash_t *fuzzy; | |||
fuzzy_hash_t *double_fuzzy; | |||
GMimeObject *parent; | |||
}; | |||
@@ -686,6 +686,7 @@ fuzzy_symbol_callback (struct worker_task *task, void *unused) | |||
} | |||
register_fuzzy_call (task, part->fuzzy); | |||
register_fuzzy_call (task, part->double_fuzzy); | |||
cur = g_list_next (cur); | |||
} | |||
@@ -843,6 +844,16 @@ fuzzy_process_handler (struct controller_session *session, f_str_t * in) | |||
free_task (task, FALSE); | |||
return; | |||
} | |||
if (! register_fuzzy_controller_call (session, task, part->double_fuzzy, cmd, value, flag, saved)) { | |||
/* Cannot write hash */ | |||
session->state = STATE_REPLY; | |||
r = rspamd_snprintf (out_buf, sizeof (out_buf), "cannot write fuzzy hash" CRLF "END" CRLF); | |||
if (! rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE)) { | |||
return; | |||
} | |||
free_task (task, FALSE); | |||
return; | |||
} | |||
cur = g_list_next (cur); | |||
} | |||
/* Process images */ |
@@ -845,31 +845,16 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task, const gchar | |||
/*XXX: add support of it */ | |||
msg_warn ("numbered matches are not supported for url regexp"); | |||
} | |||
cur = g_list_first (task->text_parts); | |||
while (cur) { | |||
part = (struct mime_text_part *)cur->data; | |||
/* Skip empty parts */ | |||
if (part->is_empty) { | |||
cur = g_list_next (cur); | |||
continue; | |||
} | |||
if (part->is_raw) { | |||
regexp = re->raw_regexp; | |||
} | |||
else { | |||
regexp = re->regexp; | |||
} | |||
callback_param.task = task; | |||
callback_param.regexp = regexp; | |||
callback_param.re = re; | |||
callback_param.found = FALSE; | |||
if (part->urls) { | |||
g_tree_foreach (part->urls, tree_url_callback, &callback_param); | |||
} | |||
if (part->html_urls && callback_param.found == FALSE) { | |||
g_tree_foreach (part->html_urls, tree_url_callback, &callback_param); | |||
} | |||
cur = g_list_next (cur); | |||
regexp = re->regexp; | |||
callback_param.task = task; | |||
callback_param.regexp = regexp; | |||
callback_param.re = re; | |||
callback_param.found = FALSE; | |||
if (task->urls) { | |||
g_tree_foreach (task->urls, tree_url_callback, &callback_param); | |||
} | |||
if (task->emails && callback_param.found == FALSE) { | |||
g_tree_foreach (task->emails, tree_url_callback, &callback_param); | |||
} | |||
if (callback_param.found == FALSE) { | |||
task_cache_add (task, re, 0); |
@@ -216,6 +216,7 @@ rspamd_vsnprintf (gchar *buf, glong max, const gchar *fmt, va_list args) | |||
guint64 ui64; | |||
guint width, sign, hex, max_width, frac_width, i; | |||
f_str_t *v; | |||
GString *gs; | |||
if (max <= 0) { | |||
return buf; | |||
@@ -312,6 +313,15 @@ rspamd_vsnprintf (gchar *buf, glong max, const gchar *fmt, va_list args) | |||
continue; | |||
case 'v': | |||
gs = va_arg (args, GString *); | |||
len = gs->len; | |||
len = (buf + len < last) ? len : (size_t) (last - buf); | |||
buf = ((gchar *)memcpy (buf, gs->str, len)) + len; | |||
fmt++; | |||
break; | |||
case 's': | |||
p = va_arg(args, gchar *); | |||
if (p == NULL) { |
@@ -44,6 +44,7 @@ | |||
* %r rlim_t | |||
* %p void * | |||
* %V f_str_t * | |||
* %v GString * | |||
* %s null-terminated string | |||
* %S ascii null-terminated string | |||
* %*s length and string |
@@ -1157,10 +1157,10 @@ url_email_end (const gchar *begin, const gchar *end, const gchar *pos, url_match | |||
void | |||
url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html) | |||
{ | |||
gint rc, off = 0; | |||
gchar *url_str = NULL; | |||
gint rc; | |||
gchar *url_str = NULL, *url_start, *url_end; | |||
struct uri *new; | |||
const guint8 *p, *end; | |||
gchar *p, *end, *begin; | |||
if (!part->orig->data || part->orig->len == 0) { | |||
@@ -1170,34 +1170,37 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text | |||
if (url_init () == 0) { | |||
if (is_html) { | |||
p = part->orig->data; | |||
end = p + part->orig->len; | |||
begin = part->orig->data; | |||
end = begin + part->orig->len; | |||
p = begin; | |||
} | |||
else { | |||
p = part->content->data; | |||
end = p + part->content->len; | |||
begin = part->content->data; | |||
end = begin + part->content->len; | |||
p = begin; | |||
} | |||
while (p < end) { | |||
if (url_try_text (pool, p, end - p, &off, &url_str)) { | |||
if (url_str != NULL && | |||
g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) { | |||
if (url_try_text (pool, p, end - p, &url_start, &url_end, &url_str)) { | |||
if (url_str != NULL) { | |||
new = memory_pool_alloc0 (pool, sizeof (struct uri)); | |||
if (new != NULL) { | |||
g_strstrip (url_str); | |||
rc = parse_uri (new, url_str, pool); | |||
if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) && | |||
new->hostlen > 0) { | |||
new->pos = url_start - begin; | |||
new->len = url_end - url_start; | |||
if (new->protocol == PROTOCOL_MAILTO) { | |||
if (!g_tree_lookup (task->emails, new)) { | |||
g_tree_insert (task->emails, new, new); | |||
} | |||
} | |||
else { | |||
g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new); | |||
if (!g_tree_lookup (task->urls, new)) { | |||
g_tree_insert (task->urls, new, new); | |||
} | |||
} | |||
part->urls_offset = g_list_prepend (part->urls_offset, new); | |||
} | |||
else if (rc != URI_ERRNO_OK) { | |||
msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc)); | |||
@@ -1208,13 +1211,18 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text | |||
else { | |||
break; | |||
} | |||
p += off; | |||
p = url_end + 1; | |||
} | |||
} | |||
/* Handle offsets of this part */ | |||
if (part->urls_offset != NULL) { | |||
part->urls_offset = g_list_reverse (part->urls_offset); | |||
memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_list_free, part->urls_offset); | |||
} | |||
} | |||
gboolean | |||
url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gint *res, gchar **url_str) | |||
url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gchar **start, gchar **fin, gchar **url_str) | |||
{ | |||
const gchar *end, *pos; | |||
gint idx, l; | |||
@@ -1247,8 +1255,11 @@ url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gint *res, gch | |||
else { | |||
*url_str = NULL; | |||
} | |||
if (res) { | |||
*res = (pos - begin) + strlen (matcher->pattern); | |||
if (start != NULL) { | |||
*start = (gchar *)pos; | |||
} | |||
if (fin != NULL) { | |||
*fin = (gchar *)pos + m.m_len; | |||
} | |||
return TRUE; | |||
} |
@@ -32,6 +32,9 @@ struct uri { | |||
struct uri *phished_url; | |||
gsize pos; | |||
gsize len; | |||
/* @protocollen should only be usable if @protocol is either | |||
* PROTOCOL_USER or an uri string should be composed. */ | |||
guint protocollen; | |||
@@ -76,7 +79,7 @@ enum protocol { | |||
void url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html); | |||
enum uri_errno parse_uri(struct uri *uri, gchar *uristring, memory_pool_t *pool); | |||
gboolean url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gint *res, gchar **url_str); | |||
gboolean url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gchar **start, gchar **end, gchar **url_str); | |||
const gchar* url_strerror (enum uri_errno err); | |||
#endif |