From: Vsevolod Stakhov Date: Tue, 2 Nov 2010 15:51:18 +0000 (+0300) Subject: * Add phishing detector (now just compares with tag's data). X-Git-Tag: 0.3.3~3 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=4cba5cf8133e8ecef7953416a4e540b1cf533342;p=rspamd.git * Add phishing detector (now just compares with tag's data). --- diff --git a/src/html.c b/src/html.c index 15f3e9b17..8bbea9548 100644 --- a/src/html.c +++ b/src/html.c @@ -468,13 +468,12 @@ entity_cmp_num (const void *m1, const void *m2) } static GNode * -construct_html_node (memory_pool_t * pool, gchar *text) +construct_html_node (memory_pool_t * pool, gchar *text, gsize tag_len) { struct html_node *html; GNode *n = NULL; struct html_tag key, *found; gchar t; - gint taglen = strlen (text); if (text == NULL || *text == '\0') { return NULL; @@ -483,7 +482,7 @@ construct_html_node (memory_pool_t * pool, gchar *text) html = memory_pool_alloc0 (pool, sizeof (struct html_node)); /* Check whether this tag is fully closed */ - if (*(text + taglen - 1) == '/') { + if (*(text + tag_len - 1) == '/') { html->flags |= FL_CLOSED; } @@ -660,41 +659,69 @@ decode_entitles (gchar *s, guint * len) * Find the first occurrence of find in s, ignore case. */ static gchar * -html_strcasestr (const gchar *s, const gchar *find) +html_strncasestr (const gchar *s, const gchar *find, gsize len) { gchar c, sc; - size_t len; + size_t mlen; if ((c = *find++) != 0) { c = g_ascii_tolower (c); - len = strlen (find); + mlen = strlen (find); do { do { - if ((sc = *s++) == 0) + if ((sc = *s++) == 0 || len -- == 0) return (NULL); } while (g_ascii_tolower (sc) != c); - } while (g_ascii_strncasecmp (s, find, len) != 0); + } while (g_ascii_strncasecmp (s, find, mlen) != 0); s--; } return ((gchar *)s); } static void -parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t id, gchar *tag_text) +check_phishing (struct worker_task *task, struct uri *href_url, const gchar *url_text) { - gchar *c = NULL, *p; + struct uri *new; + gchar *url_str; + gsize len; + gint off, rc; + + len = strcspn (url_text, "<>"); + + if (url_try_text (task->task_pool, url_text, len, &off, &url_str)) { + new = memory_pool_alloc0 (task->task_pool, sizeof (struct uri)); + if (new != NULL) { + g_strstrip (url_str); + rc = parse_uri (new, url_str, task->task_pool); + if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) { + if (g_ascii_strncasecmp (href_url->host, new->host, + MAX (href_url->hostlen, new->hostlen)) != 0) { + href_url->is_phished = TRUE; + } + } + else { + msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc)); + } + } + } + +} + +static void +parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t id, gchar *tag_text, gsize tag_len) +{ + gchar *c = NULL, *p, *url_text; gint len, rc; - gchar *url_text; struct uri *url; gboolean got_single_quote = FALSE, got_double_quote = FALSE; /* For A tags search for href= and for IMG tags search for src= */ if (id == Tag_A) { - c = html_strcasestr (tag_text, "href="); + c = html_strncasestr (tag_text, "href=", tag_len); len = sizeof ("href=") - 1; } else if (id == Tag_IMG) { - c = html_strcasestr (tag_text, "src="); + c = html_strncasestr (tag_text, "src=", tag_len); len = sizeof ("src=") - 1; } @@ -707,7 +734,7 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i } len = 0; p = c; - while (*p) { + while (*p && p - tag_text < tag_len) { if (got_double_quote) { if (*p == '"') { break; @@ -753,7 +780,9 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i g_strlcpy (url_text, c, len + 1); decode_entitles (url_text, NULL); - if (g_ascii_strncasecmp (url_text, "http://", sizeof ("http://") - 1) != 0) { + if (g_ascii_strncasecmp (url_text, "http://", sizeof ("http://") - 1) != 0 && + g_ascii_strncasecmp (url_text, "www", sizeof ("www") - 1) != 0 && + g_ascii_strncasecmp (url_text, "ftp://", sizeof ("ftp://") - 1) != 0) { return; } @@ -761,6 +790,12 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i rc = parse_uri (url, url_text, task->task_pool); if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST && url->hostlen != 0) { + /* + * Check for phishing + */ + if ((p = strchr (c, '>')) != NULL ) { + check_phishing (task, url, p + 1); + } if (part->html_urls && g_tree_lookup (part->html_urls, url_text) == NULL) { g_tree_insert (part->html_urls, url_text, url); task->urls = g_list_prepend (task->urls, url); @@ -770,7 +805,8 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i } gboolean -add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_part *part, gchar *tag_text, GNode ** cur_level) +add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_part *part, + gchar *tag_text, gsize tag_len, GNode ** cur_level) { GNode *new; struct html_node *data; @@ -795,17 +831,17 @@ add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_ part->html_nodes = new; memory_pool_add_destructor (pool, (pool_destruct_func) g_node_destroy, part->html_nodes); /* Call once again with root node */ - return add_html_node (task, pool, part, tag_text, cur_level); + return add_html_node (task, pool, part, tag_text, tag_len, cur_level); } else { - new = construct_html_node (pool, tag_text); + new = construct_html_node (pool, tag_text, tag_len); if (new == NULL) { debug_task ("cannot construct HTML node for text '%s'", tag_text); return -1; } data = new->data; if (data->tag && (data->tag->id == Tag_A || data->tag->id == Tag_IMG) && ((data->flags & FL_CLOSING) == 0)) { - parse_tag_url (task, part, data->tag->id, tag_text); + parse_tag_url (task, part, data->tag->id, tag_text, tag_len); } if (data->flags & FL_CLOSING) { if (!*cur_level) { diff --git a/src/html.h b/src/html.h index 048598f37..075818392 100644 --- a/src/html.h +++ b/src/html.h @@ -207,7 +207,8 @@ struct html_node { /* Forwarded declaration */ struct worker_task; -gboolean add_html_node (struct worker_task *task, memory_pool_t *pool, struct mime_text_part *part, gchar *tag_text, GNode **cur_level); +gboolean add_html_node (struct worker_task *task, memory_pool_t *pool, + struct mime_text_part *part, gchar *tag_text, gsize tag_len, GNode **cur_level); struct html_tag * get_tag_by_name (const gchar *name); void decode_entitles (gchar *s, guint *len); diff --git a/src/lua/lua_common.c b/src/lua/lua_common.c index 38111ea38..9c4cef113 100644 --- a/src/lua/lua_common.c +++ b/src/lua/lua_common.c @@ -228,6 +228,7 @@ init_lua (struct config_file *cfg) (void)luaopen_task (L); (void)luaopen_textpart (L); (void)luaopen_image (L); + (void)luaopen_url (L); (void)luaopen_message (L); (void)luaopen_classifier (L); (void)luaopen_statfile (L); diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h index ede42562a..6c51368fb 100644 --- a/src/lua/lua_common.h +++ b/src/lua/lua_common.h @@ -30,6 +30,7 @@ gint luaopen_hash_table (lua_State *L); gint luaopen_trie (lua_State * L); gint luaopen_textpart (lua_State *L); gint luaopen_image (lua_State *L); +gint luaopen_url (lua_State *L); gint luaopen_classifier (lua_State *L); gint luaopen_statfile (lua_State * L); void init_lua (struct config_file *cfg); diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c index 2b19c45be..7102940aa 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -122,6 +122,23 @@ static const struct luaL_reg imagelib_m[] = { {NULL, NULL} }; +/* URL methods */ +LUA_FUNCTION_DEF (url, get_host); +LUA_FUNCTION_DEF (url, get_user); +LUA_FUNCTION_DEF (url, get_path); +LUA_FUNCTION_DEF (url, get_text); +LUA_FUNCTION_DEF (url, is_phished); + +static const struct luaL_reg urllib_m[] = { + LUA_INTERFACE_DEF (url, get_host), + LUA_INTERFACE_DEF (url, get_user), + LUA_INTERFACE_DEF (url, get_path), + LUA_INTERFACE_DEF (url, get_text), + LUA_INTERFACE_DEF (url, is_phished), + {"__tostring", lua_class_tostring}, + {NULL, NULL} +}; + /* Utility functions */ static struct worker_task * lua_check_task (lua_State * L) @@ -147,8 +164,16 @@ lua_check_image (lua_State * L) return *((struct rspamd_image **)ud); } +static struct uri * +lua_check_url (lua_State * L) +{ + void *ud = luaL_checkudata (L, 1, "rspamd{url}"); + luaL_argcheck (L, ud != NULL, 1, "'url' expected"); + return *((struct uri **)ud); +} + /*** Task interface ***/ -static gint +static int lua_task_get_message (lua_State * L) { GMimeMessage **pmsg; @@ -193,19 +218,24 @@ lua_task_get_urls (lua_State * L) gint i = 1; struct worker_task *task = lua_check_task (L); GList *cur; - struct uri *url; + struct uri **purl; - if (task != NULL) { - lua_newtable (L); - cur = g_list_first (task->urls); - while (cur) { - url = cur->data; - lua_pushstring (L, struri (url)); - lua_rawseti (L, -2, i++); - cur = g_list_next (cur); + if (task) { + cur = task->urls; + if (cur != NULL) { + lua_newtable (L); + while (cur) { + purl = lua_newuserdata (L, sizeof (struct uri *)); + lua_setclass (L, "rspamd{url}", -1); + *purl = cur->data; + lua_rawseti (L, -2, i++); + cur = g_list_next (cur); + } + return 1; } } + lua_pushnil (L); return 1; } @@ -919,6 +949,81 @@ lua_image_get_filename (lua_State *L) return 1; } +/* URL part */ +static gint +lua_url_get_host (lua_State *L) +{ + struct uri *url = lua_check_url (L); + + if (url != NULL) { + lua_pushlstring (L, url->host, url->hostlen); + } + else { + lua_pushnil (L); + } + return 1; +} + +static gint +lua_url_get_user (lua_State *L) +{ + struct uri *url = lua_check_url (L); + + if (url != NULL) { + lua_pushlstring (L, url->user, url->userlen); + } + else { + lua_pushnil (L); + } + + return 1; +} + +static gint +lua_url_get_path (lua_State *L) +{ + struct uri *url = lua_check_url (L); + + if (url != NULL) { + lua_pushlstring (L, url->data, url->datalen); + } + else { + lua_pushnil (L); + } + + return 1; +} + +static gint +lua_url_get_text (lua_State *L) +{ + struct uri *url = lua_check_url (L); + + if (url != NULL) { + lua_pushstring (L, struri (url)); + } + else { + lua_pushnil (L); + } + + return 1; +} + +static gint +lua_url_is_phished (lua_State *L) +{ + struct uri *url = lua_check_url (L); + + if (url != NULL) { + lua_pushboolean (L, url->is_phished); + } + else { + lua_pushnil (L); + } + + return 1; +} + /* Init part */ gint luaopen_task (lua_State * L) @@ -946,3 +1051,12 @@ luaopen_image (lua_State * L) return 1; } + +gint +luaopen_url (lua_State * L) +{ + lua_newclass (L, "rspamd{url}", urllib_m); + luaL_openlib (L, "rspamd_url", null_reg, 0); + + return 1; +} diff --git a/src/message.c b/src/message.c index 845386b21..60072d45d 100644 --- a/src/message.c +++ b/src/message.c @@ -111,9 +111,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex case 1: /* HTML/XML */ lc = '>'; in_q = state = 0; - *p = '\0'; - add_html_node (task, pool, part, tbegin, &level_ptr); - *p = '>'; + add_html_node (task, pool, part, tbegin, p - tbegin - 1, &level_ptr); break; case 2: /* PHP */ diff --git a/src/plugins/lua/phishing.lua b/src/plugins/lua/phishing.lua new file mode 100644 index 000000000..8021a0069 --- /dev/null +++ b/src/plugins/lua/phishing.lua @@ -0,0 +1,28 @@ +-- Phishing detection interface for selecting phished urls and inserting corresponding symbol +-- +-- +local symbol = 'PHISHED_URL' + +function phishing_cb (task) + local urls = task:get_urls(); + + if urls then + for _,url in ipairs(urls) do + if url:is_phished() then + task:insert_result(symbol, 1, url:get_host()) + end + end + end +end + + +local opts = rspamd_config:get_all_opt('phishing') +if opts then + if opts['symbol'] then + symbol = opts['symbol'] + + -- Register symbol's callback + rspamd_config:register_symbol(symbol, 1.0, 'phishing_cb') + end + -- If no symbol defined, do not register this module +end diff --git a/src/url.c b/src/url.c index 63c31095c..e801527bd 100644 --- a/src/url.c +++ b/src/url.c @@ -160,7 +160,7 @@ enum { #define is_urlsafe(x) ((url_scanner_table[(guchar)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0) -static const gchar * +const gchar * url_strerror (enum uri_errno err) { switch (err) { @@ -1147,12 +1147,11 @@ url_email_end (const gchar *begin, const gchar *end, const gchar *pos, url_match void url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html) { - struct url_matcher *matcher; - gint rc, idx; + gint rc, off = 0; gchar *url_str = NULL; struct uri *new; - const guint8 *p, *end, *pos; - url_match_t m; + const guint8 *p, *end; + if (!part->orig->data || part->orig->len == 0) { msg_warn ("got empty text part"); @@ -1169,37 +1168,61 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text end = p + part->content->len; } while (p < end) { - if ((pos = rspamd_trie_lookup (url_scanner->patterns, p, end - p, &idx)) == NULL) { - break; - } - else { - matcher = &matchers[idx]; - m.pattern = matcher->pattern; - m.prefix = matcher->prefix; - if (matcher->start (p, end, pos, &m) && matcher->end (p, end, pos, &m)) { - url_str = memory_pool_alloc (task->task_pool, m.m_len + 1); - memcpy (url_str, m.m_begin, m.m_len); - url_str[m.m_len] = '\0'; - if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) { - new = memory_pool_alloc (pool, sizeof (struct uri)); - if (new != NULL) { - g_strstrip (url_str); - rc = parse_uri (new, url_str, pool); - if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) { - g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new); - task->urls = g_list_prepend (task->urls, new); - } - else { - msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc)); - } + if (url_try_text (pool, p, end - p, &off, &url_str)) { + if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) { + new = memory_pool_alloc0 (pool, sizeof (struct uri)); + if (new != NULL) { + g_strstrip (url_str); + rc = parse_uri (new, url_str, pool); + if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) { + g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new); + task->urls = g_list_prepend (task->urls, new); + } + else { + msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc)); } } } - pos += strlen (matcher->pattern); } - p = pos; + else { + break; + } + p += off; + } + } +} + +gboolean +url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gint *res, gchar **url_str) +{ + const gchar *end, *pos; + gint idx; + struct url_matcher *matcher; + url_match_t m; + + end = begin + len; + if (url_init () == 0) { + if ((pos = rspamd_trie_lookup (url_scanner->patterns, begin, len, &idx)) == NULL) { + return FALSE; + } + else { + matcher = &matchers[idx]; + m.pattern = matcher->pattern; + m.prefix = matcher->prefix; + if (matcher->start (begin, end, pos, &m) && matcher->end (begin, end, pos, &m)) { + *url_str = memory_pool_alloc (pool, m.m_len + 1); + memcpy (*url_str, m.m_begin, m.m_len); + (*url_str)[m.m_len] = '\0'; + + } + if (res) { + *res = strlen (matcher->pattern); + } + return TRUE; } } + + return FALSE; } /* diff --git a/src/url.h b/src/url.h index de7e8fe85..105466aa5 100644 --- a/src/url.h +++ b/src/url.h @@ -41,8 +41,9 @@ struct uri { guint fragmentlen; /* Flags */ - guint ipv6; /* URI contains IPv6 host */ - guint form; /* URI originated from form */ + gboolean ipv6; /* URI contains IPv6 host */ + gboolean form; /* URI originated from form */ + gboolean is_phished; /* URI maybe phishing */ }; enum uri_errno { @@ -73,5 +74,7 @@ enum protocol { void url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html); enum uri_errno parse_uri(struct uri *uri, gchar *uristring, memory_pool_t *pool); +gboolean url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gint *res, gchar **url_str); +const gchar* url_strerror (enum uri_errno err); #endif