diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-06-23 19:05:58 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-06-23 19:05:58 +0400 |
commit | 5022c0333ffd8ce5eca3dc9e2679b612e2c9ce99 (patch) | |
tree | 4b7fd05c9f01700ea372fb941b87312fceece513 /src/url.c | |
parent | de94e18f57a0dccbab76efb3d574c0485a9e3700 (diff) | |
download | rspamd-5022c0333ffd8ce5eca3dc9e2679b612e2c9ce99.tar.gz rspamd-5022c0333ffd8ce5eca3dc9e2679b612e2c9ce99.zip |
* Fixes to fuzzy hashing logic, skip urls while estimating fuzzy hash
Fix tags stripping.
Fix phishing checks (ignore img tags).
Diffstat (limited to 'src/url.c')
-rw-r--r-- | src/url.c | 41 |
1 files changed, 26 insertions, 15 deletions
@@ -1157,10 +1157,10 @@ url_email_end (const gchar *begin, const gchar *end, const gchar *pos, url_match void url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html) { - gint rc, off = 0; - gchar *url_str = NULL; + gint rc; + gchar *url_str = NULL, *url_start, *url_end; struct uri *new; - const guint8 *p, *end; + gchar *p, *end, *begin; if (!part->orig->data || part->orig->len == 0) { @@ -1170,34 +1170,37 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text if (url_init () == 0) { if (is_html) { - p = part->orig->data; - end = p + part->orig->len; + begin = part->orig->data; + end = begin + part->orig->len; + p = begin; } else { - p = part->content->data; - end = p + part->content->len; + begin = part->content->data; + end = begin + part->content->len; + p = begin; } while (p < end) { - if (url_try_text (pool, p, end - p, &off, &url_str)) { - if (url_str != NULL && - g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) { + if (url_try_text (pool, p, end - p, &url_start, &url_end, &url_str)) { + if (url_str != NULL) { new = memory_pool_alloc0 (pool, sizeof (struct uri)); if (new != NULL) { g_strstrip (url_str); rc = parse_uri (new, url_str, pool); if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) && new->hostlen > 0) { + new->pos = url_start - begin; + new->len = url_end - url_start; if (new->protocol == PROTOCOL_MAILTO) { if (!g_tree_lookup (task->emails, new)) { g_tree_insert (task->emails, new, new); } } else { - g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new); if (!g_tree_lookup (task->urls, new)) { g_tree_insert (task->urls, new, new); } } + part->urls_offset = g_list_prepend (part->urls_offset, new); } else if (rc != URI_ERRNO_OK) { msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc)); @@ -1208,13 +1211,18 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text else { break; } - p += off; + p = url_end + 1; } } + /* Handle offsets of this part */ + if (part->urls_offset != NULL) { + part->urls_offset = g_list_reverse (part->urls_offset); + memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_list_free, part->urls_offset); + } } gboolean -url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gint *res, gchar **url_str) +url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gchar **start, gchar **fin, gchar **url_str) { const gchar *end, *pos; gint idx, l; @@ -1247,8 +1255,11 @@ url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gint *res, gch else { *url_str = NULL; } - if (res) { - *res = (pos - begin) + strlen (matcher->pattern); + if (start != NULL) { + *start = (gchar *)pos; + } + if (fin != NULL) { + *fin = (gchar *)pos + m.m_len; } return TRUE; } |