]> source.dussan.org Git - rspamd.git/commitdiff
* Add phishing detector (now just compares <a href> with tag's data).
author: Vsevolod Stakhov <vsevolod@rambler-co.ru>
Tue, 2 Nov 2010 15:51:18 +0000 (18:51 +0300)
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru>
Tue, 2 Nov 2010 15:51:18 +0000 (18:51 +0300)
src/html.c
src/html.h
src/lua/lua_common.c
src/lua/lua_common.h
src/lua/lua_task.c
src/message.c
src/plugins/lua/phishing.lua [new file with mode: 0644]
src/url.c
src/url.h

index 15f3e9b17f25d6a856a72cd3db7a856bfc247359..8bbea95483f9336734c0c0fb67b84262aece0064 100644 (file)
@@ -468,13 +468,12 @@ entity_cmp_num (const void *m1, const void *m2)
 }
 
 static GNode                   *
-construct_html_node (memory_pool_t * pool, gchar *text)
+construct_html_node (memory_pool_t * pool, gchar *text, gsize tag_len)
 {
        struct html_node               *html;
        GNode                          *n = NULL;
        struct html_tag                 key, *found;
        gchar                           t;
-       gint                            taglen = strlen (text);
 
        if (text == NULL || *text == '\0') {
                return NULL;
@@ -483,7 +482,7 @@ construct_html_node (memory_pool_t * pool, gchar *text)
        html = memory_pool_alloc0 (pool, sizeof (struct html_node));
 
        /* Check whether this tag is fully closed */
-       if (*(text + taglen - 1) == '/') {
+       if (*(text + tag_len - 1) == '/') {
                html->flags |= FL_CLOSED;
        }
 
@@ -660,41 +659,69 @@ decode_entitles (gchar *s, guint * len)
  * Find the first occurrence of find in s, ignore case.
  */
 static gchar *
-html_strcasestr (const gchar *s, const gchar *find)
+html_strncasestr (const gchar *s, const gchar *find, gsize len)
 {
        gchar                           c, sc;
-       size_t len;
+       size_t mlen;
 
        if ((c = *find++) != 0) {
                c = g_ascii_tolower (c);
-               len = strlen (find);
+               mlen = strlen (find);
                do {
                        do {
-                               if ((sc = *s++) == 0)
+                               if ((sc = *s++) == 0 || len -- == 0)
                                        return (NULL);
                        } while (g_ascii_tolower (sc) != c);
-               } while (g_ascii_strncasecmp (s, find, len) != 0);
+               } while (g_ascii_strncasecmp (s, find, mlen) != 0);
                s--;
        }
        return ((gchar *)s);
 }
 
+/*
+ * Compare the host of a URL taken from a tag attribute with the host of the
+ * first URL found in the text that follows the tag, and set
+ * href_url->is_phished when the two hosts differ.
+ *
+ * task     - task whose memory pool is used for all allocations made here
+ * href_url - URL already parsed from the tag; only host/hostlen are read
+ * url_text - text after the tag's closing '>'; only the part up to the next
+ *            '<' or '>' is scanned
+ */
 static void
-parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t id, gchar *tag_text)
+check_phishing (struct worker_task *task, struct uri *href_url, const gchar *url_text)
 {
-       gchar                           *c = NULL, *p;
+       struct uri                     *new;
+       gchar                          *url_str = NULL;
+       gsize                           len;
+       gint                            off, rc;
+
+       len = strcspn (url_text, "<>");
+
+       /*
+        * url_str is preset to NULL and re-checked so that a success return
+        * that did not actually store a string is never dereferenced.
+        */
+       if (!url_try_text (task->task_pool, url_text, len, &off, &url_str) || url_str == NULL) {
+               return;
+       }
+
+       new = memory_pool_alloc0 (task->task_pool, sizeof (struct uri));
+       if (new == NULL) {
+               return;
+       }
+
+       g_strstrip (url_str);
+       rc = parse_uri (new, url_str, task->task_pool);
+       if (rc != URI_ERRNO_OK && rc != URI_ERRNO_NO_SLASHES && rc != URI_ERRNO_NO_HOST_SLASH) {
+               msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc));
+               return;
+       }
+
+       if (href_url->host == NULL || new->host == NULL) {
+               /* Nothing to compare */
+               return;
+       }
+
+       /*
+        * Hosts are length-delimited (host + hostlen), so they are equal only
+        * when the lengths match and the bytes match ignoring case.  Comparing
+        * MAX (hostlen) bytes would read past the end of the shorter host.
+        */
+       if (href_url->hostlen != new->hostlen ||
+                       g_ascii_strncasecmp (href_url->host, new->host, href_url->hostlen) != 0) {
+               href_url->is_phished = TRUE;
+       }
+}
+
+static void
+parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t id, gchar *tag_text, gsize tag_len)
+{
+       gchar                           *c = NULL, *p, *url_text;
        gint                            len, rc;
-       gchar                           *url_text;
        struct uri                     *url;
        gboolean                        got_single_quote = FALSE, got_double_quote = FALSE;
 
        /* For A tags search for href= and for IMG tags search for src= */
        if (id == Tag_A) {
-               c = html_strcasestr (tag_text, "href=");
+               c = html_strncasestr (tag_text, "href=", tag_len);
                len = sizeof ("href=") - 1;
        }
        else if (id == Tag_IMG) {
-               c = html_strcasestr (tag_text, "src=");
+               c = html_strncasestr (tag_text, "src=", tag_len);
                len = sizeof ("src=") - 1;
        }
 
@@ -707,7 +734,7 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i
                }
                len = 0;
                p = c;
-               while (*p) {
+               while (*p && p - tag_text < tag_len) {
                        if (got_double_quote) {
                                if (*p == '"') {
                                        break;
@@ -753,7 +780,9 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i
                g_strlcpy (url_text, c, len + 1);
                decode_entitles (url_text, NULL);
 
-               if (g_ascii_strncasecmp (url_text, "http://", sizeof ("http://") - 1) != 0) {
+               if (g_ascii_strncasecmp (url_text, "http://", sizeof ("http://") - 1) != 0 &&
+                               g_ascii_strncasecmp (url_text, "www", sizeof ("www") - 1) != 0 &&
+                               g_ascii_strncasecmp (url_text, "ftp://", sizeof ("ftp://") - 1) != 0) {
                        return;
                }
 
@@ -761,6 +790,12 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i
                rc = parse_uri (url, url_text, task->task_pool);
 
                if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST && url->hostlen != 0) {
+                       /*
+                        * Check for phishing
+                        */
+                       if ((p = strchr (c, '>')) != NULL ) {
+                               check_phishing (task, url, p + 1);
+                       }
                        if (part->html_urls && g_tree_lookup (part->html_urls, url_text) == NULL) {
                                g_tree_insert (part->html_urls, url_text, url);
                                task->urls = g_list_prepend (task->urls, url);
@@ -770,7 +805,8 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i
 }
 
 gboolean
-add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_part *part, gchar *tag_text, GNode ** cur_level)
+add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_part *part,
+               gchar *tag_text, gsize tag_len, GNode ** cur_level)
 {
        GNode                          *new;
        struct html_node               *data;
@@ -795,17 +831,17 @@ add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_
                part->html_nodes = new;
                memory_pool_add_destructor (pool, (pool_destruct_func) g_node_destroy, part->html_nodes);
                /* Call once again with root node */
-               return add_html_node (task, pool, part, tag_text, cur_level);
+               return add_html_node (task, pool, part, tag_text, tag_len, cur_level);
        }
        else {
-               new = construct_html_node (pool, tag_text);
+               new = construct_html_node (pool, tag_text, tag_len);
                if (new == NULL) {
                        debug_task ("cannot construct HTML node for text '%s'", tag_text);
                        return -1;
                }
                data = new->data;
                if (data->tag && (data->tag->id == Tag_A || data->tag->id == Tag_IMG) && ((data->flags & FL_CLOSING) == 0)) {
-                       parse_tag_url (task, part, data->tag->id, tag_text);
+                       parse_tag_url (task, part, data->tag->id, tag_text, tag_len);
                }
                if (data->flags & FL_CLOSING) {
                        if (!*cur_level) {
index 048598f3749e2c7630f262591cd66a67f51e691d..0758183925ea9af3e4836df55bb5b5bb179132d9 100644 (file)
@@ -207,7 +207,8 @@ struct html_node {
 /* Forwarded declaration */
 struct worker_task;
 
-gboolean add_html_node (struct worker_task *task, memory_pool_t *pool, struct mime_text_part *part, gchar *tag_text, GNode **cur_level);
+gboolean add_html_node (struct worker_task *task, memory_pool_t *pool,
+               struct mime_text_part *part, gchar *tag_text, gsize tag_len, GNode **cur_level);
 struct html_tag * get_tag_by_name (const gchar *name);
 void decode_entitles (gchar *s, guint *len);
 
index 38111ea38247f72370d984d3755308a844f2cdb8..9c4cef1130fc2b3c897be2283298031e2a1bb0dd 100644 (file)
@@ -228,6 +228,7 @@ init_lua (struct config_file *cfg)
        (void)luaopen_task (L);
        (void)luaopen_textpart (L);
        (void)luaopen_image (L);
+       (void)luaopen_url (L);
        (void)luaopen_message (L);
        (void)luaopen_classifier (L);
        (void)luaopen_statfile (L);
index ede42562a85474305b3079f12a7087254831fb09..6c51368fbf9b00fc7969d7c30840be09488616d6 100644 (file)
@@ -30,6 +30,7 @@ gint luaopen_hash_table (lua_State *L);
 gint luaopen_trie (lua_State * L);
 gint luaopen_textpart (lua_State *L);
 gint luaopen_image (lua_State *L);
+gint luaopen_url (lua_State *L);
 gint luaopen_classifier (lua_State *L);
 gint luaopen_statfile (lua_State * L);
 void init_lua (struct config_file *cfg);
index 2b19c45be0ca6718db2d83f78254cbb5130af32a..7102940aab6806515a3ab0f9776fa26e3161c8bd 100644 (file)
@@ -122,6 +122,23 @@ static const struct luaL_reg    imagelib_m[] = {
        {NULL, NULL}
 };
 
+/* URL methods: forward declarations of the functions listed in urllib_m */
+LUA_FUNCTION_DEF (url, get_host);
+LUA_FUNCTION_DEF (url, get_user);
+LUA_FUNCTION_DEF (url, get_path);
+LUA_FUNCTION_DEF (url, get_text);
+LUA_FUNCTION_DEF (url, is_phished);
+
+/* Method table handed to lua_newclass () for "rspamd{url}"; terminated by {NULL, NULL} */
+static const struct luaL_reg    urllib_m[] = {
+       LUA_INTERFACE_DEF (url, get_host),
+       LUA_INTERFACE_DEF (url, get_user),
+       LUA_INTERFACE_DEF (url, get_path),
+       LUA_INTERFACE_DEF (url, get_text),
+       LUA_INTERFACE_DEF (url, is_phished),
+       {"__tostring", lua_class_tostring},
+       {NULL, NULL}
+};
+
 /* Utility functions */
 static struct worker_task      *
 lua_check_task (lua_State * L)
@@ -147,8 +164,16 @@ lua_check_image (lua_State * L)
        return *((struct rspamd_image **)ud);
 }
 
+/*
+ * Fetch argument 1 as an "rspamd{url}" userdata and return the struct uri
+ * pointer stored inside it.
+ */
+static struct uri      *
+lua_check_url (lua_State * L)
+{
+       struct uri                    **purl;
+
+       purl = luaL_checkudata (L, 1, "rspamd{url}");
+       luaL_argcheck (L, purl != NULL, 1, "'url' expected");
+
+       return *purl;
+}
+
 /*** Task interface    ***/
-static gint
+static int
 lua_task_get_message (lua_State * L)
 {
        GMimeMessage                  **pmsg;
@@ -193,19 +218,24 @@ lua_task_get_urls (lua_State * L)
        gint                            i = 1;
        struct worker_task             *task = lua_check_task (L);
        GList                          *cur;
-       struct uri                     *url;
+       struct uri                    **purl;
 
-       if (task != NULL) {
-               lua_newtable (L);
-               cur = g_list_first (task->urls);
-               while (cur) {
-                       url = cur->data;
-                       lua_pushstring (L, struri (url));
-                       lua_rawseti (L, -2, i++);
-                       cur = g_list_next (cur);
+       if (task) {
+               cur = task->urls;
+               if (cur != NULL) {
+                       lua_newtable (L);
+                       while (cur) {
+                               purl = lua_newuserdata (L, sizeof (struct uri *));
+                               lua_setclass (L, "rspamd{url}", -1);
+                               *purl = cur->data;
+                               lua_rawseti (L, -2, i++);
+                               cur = g_list_next (cur);
+                       }
+                       return 1;
                }
        }
 
+       lua_pushnil (L);
        return 1;
 }
 
@@ -919,6 +949,81 @@ lua_image_get_filename (lua_State *L)
        return 1;
 }
 
+/* URL part */
+/* url:get_host () - pushes the host (hostlen bytes of host), or nil when there is no URL */
+static gint
+lua_url_get_host (lua_State *L)
+{
+       struct uri                      *u = lua_check_url (L);
+
+       if (u == NULL) {
+               lua_pushnil (L);
+               return 1;
+       }
+
+       lua_pushlstring (L, u->host, u->hostlen);
+       return 1;
+}
+
+/* url:get_user () - pushes the user part (userlen bytes of user), or nil when there is no URL */
+static gint
+lua_url_get_user (lua_State *L)
+{
+       struct uri                      *u = lua_check_url (L);
+
+       if (u == NULL) {
+               lua_pushnil (L);
+               return 1;
+       }
+
+       lua_pushlstring (L, u->user, u->userlen);
+       return 1;
+}
+
+/* url:get_path () - pushes datalen bytes of the URL's data field, or nil when there is no URL */
+static gint
+lua_url_get_path (lua_State *L)
+{
+       struct uri                      *u = lua_check_url (L);
+
+       if (u == NULL) {
+               lua_pushnil (L);
+               return 1;
+       }
+
+       lua_pushlstring (L, u->data, u->datalen);
+       return 1;
+}
+
+/* url:get_text () - pushes the string returned by struri () for the URL, or nil when there is no URL */
+static gint
+lua_url_get_text (lua_State *L)
+{
+       struct uri                      *u = lua_check_url (L);
+
+       if (u == NULL) {
+               lua_pushnil (L);
+               return 1;
+       }
+
+       lua_pushstring (L, struri (u));
+       return 1;
+}
+
+/* url:is_phished () - pushes the URL's is_phished flag as a boolean, or nil when there is no URL */
+static gint
+lua_url_is_phished (lua_State *L)
+{
+       struct uri                      *u = lua_check_url (L);
+
+       if (u == NULL) {
+               lua_pushnil (L);
+               return 1;
+       }
+
+       lua_pushboolean (L, u->is_phished);
+       return 1;
+}
+
 /* Init part */
 gint
 luaopen_task (lua_State * L)
@@ -946,3 +1051,12 @@ luaopen_image (lua_State * L)
 
        return 1;
 }
+
+/*
+ * Register the "rspamd{url}" class with the urllib_m method table and open
+ * the "rspamd_url" library using null_reg (presumably an empty registration
+ * table defined elsewhere - TODO confirm).  Always returns 1.
+ */
+gint
+luaopen_url (lua_State * L)
+{
+       lua_newclass (L, "rspamd{url}", urllib_m);
+       luaL_openlib (L, "rspamd_url", null_reg, 0);
+
+       return 1;
+}
index 845386b219dbb1bfa6bb56dcb5a17c7cd87f4eff..60072d45dcfa6ac2beec8ad8db8d76a111af0b1b 100644 (file)
@@ -111,9 +111,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex
                        case 1:                 /* HTML/XML */
                                lc = '>';
                                in_q = state = 0;
-                               *p = '\0';
-                               add_html_node (task, pool, part, tbegin, &level_ptr);
-                               *p = '>';
+                               add_html_node (task, pool, part, tbegin, p - tbegin - 1, &level_ptr);
                                break;
 
                        case 2:                 /* PHP */
diff --git a/src/plugins/lua/phishing.lua b/src/plugins/lua/phishing.lua
new file mode 100644 (file)
index 0000000..8021a00
--- /dev/null
@@ -0,0 +1,28 @@
+-- Phishing plugin: inserts a symbol for every URL of a task for which
+-- url:is_phished() is true
+--
+local symbol = 'PHISHED_URL'
+
+-- Symbol callback; kept global because it is registered by name below
+function phishing_cb (task)
+       local urls = task:get_urls()
+       if not urls then
+               return
+       end
+
+       for _, u in ipairs(urls) do
+               if u:is_phished() then
+                       task:insert_result(symbol, 1, u:get_host())
+               end
+       end
+end
+
+
+-- The callback is registered only when the 'phishing' options exist and
+-- define a symbol; otherwise this module stays inactive
+local opts = rspamd_config:get_all_opt('phishing')
+if opts and opts['symbol'] then
+    symbol = opts['symbol']
+    rspamd_config:register_symbol(symbol, 1.0, 'phishing_cb')
+end
index 63c31095ca6b889d8b68aa84e67cb8090b292874..e801527bd71339dfabc7aec99a302cc1a762c9ec 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -160,7 +160,7 @@ enum {
 #define is_urlsafe(x) ((url_scanner_table[(guchar)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0)
 
 
-static const gchar              *
+const gchar              *
 url_strerror (enum uri_errno err)
 {
        switch (err) {
@@ -1147,12 +1147,11 @@ url_email_end (const gchar *begin, const gchar *end, const gchar *pos, url_match
 void
 url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html)
 {
-       struct url_matcher             *matcher;
-       gint                            rc, idx;
+       gint                            rc, off = 0;
        gchar                           *url_str = NULL;
        struct uri                     *new;
-       const guint8                   *p, *end, *pos;
-       url_match_t                     m;
+       const guint8                   *p, *end;
+
 
        if (!part->orig->data || part->orig->len == 0) {
                msg_warn ("got empty text part");
@@ -1169,37 +1168,61 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text
                        end = p + part->content->len;
                }
                while (p < end) {
-                       if ((pos = rspamd_trie_lookup (url_scanner->patterns, p, end - p, &idx)) == NULL) {
-                               break;
-                       }
-                       else {
-                               matcher = &matchers[idx];
-                               m.pattern = matcher->pattern;
-                               m.prefix = matcher->prefix;
-                               if (matcher->start (p, end, pos, &m) && matcher->end (p, end, pos, &m)) {
-                                       url_str = memory_pool_alloc (task->task_pool, m.m_len + 1);
-                                       memcpy (url_str, m.m_begin, m.m_len);
-                                       url_str[m.m_len] = '\0';
-                                       if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) {
-                                               new = memory_pool_alloc (pool, sizeof (struct uri));
-                                               if (new != NULL) {
-                                                       g_strstrip (url_str);
-                                                       rc = parse_uri (new, url_str, pool);
-                                                       if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) {
-                                                               g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
-                                                               task->urls = g_list_prepend (task->urls, new);
-                                                       }
-                                                       else {
-                                                               msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc));
-                                                       }
+                       if (url_try_text (pool, p, end - p, &off, &url_str)) {
+                               if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) {
+                                       new = memory_pool_alloc0 (pool, sizeof (struct uri));
+                                       if (new != NULL) {
+                                               g_strstrip (url_str);
+                                               rc = parse_uri (new, url_str, pool);
+                                               if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) {
+                                                       g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
+                                                       task->urls = g_list_prepend (task->urls, new);
+                                               }
+                                               else {
+                                                       msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc));
                                                }
                                        }
                                }
-                               pos += strlen (matcher->pattern);
                        }
-                       p = pos;
+                       else {
+                               break;
+                       }
+                       p += off;
+               }
+       }
+}
+
+/*
+ * Try to extract the first acceptable URL from a text range.
+ *
+ * pool    - pool used to allocate the extracted string
+ * begin   - start of the text to scan
+ * len     - number of bytes to scan
+ * res     - if not NULL, receives the offset from begin at which the caller
+ *           should resume scanning (lookup position plus pattern length)
+ * url_str - receives a NUL-terminated copy of the matched text
+ *
+ * Returns TRUE only when *url_str has been stored.  Returns FALSE when the
+ * URL scanner cannot be initialised, when no candidate is accepted by its
+ * matcher, or when the allocation fails.
+ */
+gboolean
+url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gint *res, gchar **url_str)
+{
+       const gchar                    *end, *cur, *pos, *next;
+       gint                            idx;
+       struct url_matcher             *matcher;
+       url_match_t                     m;
+
+       if (url_init () != 0) {
+               return FALSE;
+       }
+
+       end = begin + len;
+       cur = begin;
+
+       while (cur < end) {
+               if ((pos = rspamd_trie_lookup (url_scanner->patterns, cur, end - cur, &idx)) == NULL) {
+                       return FALSE;
+               }
+
+               matcher = &matchers[idx];
+               m.pattern = matcher->pattern;
+               m.prefix = matcher->prefix;
+               /* The resume point is relative to the lookup result, not to the scan start */
+               next = pos + strlen (matcher->pattern);
+
+               if (matcher->start (cur, end, pos, &m) && matcher->end (cur, end, pos, &m)) {
+                       *url_str = memory_pool_alloc (pool, m.m_len + 1);
+                       if (*url_str == NULL) {
+                               return FALSE;
+                       }
+                       memcpy (*url_str, m.m_begin, m.m_len);
+                       (*url_str)[m.m_len] = '\0';
+                       if (res) {
+                               *res = (gint)(next - begin);
+                       }
+                       return TRUE;
+               }
+
+               /*
+                * Candidate rejected by its matcher: keep scanning behind it
+                * rather than reporting success without a string.
+                */
+               if (next <= cur) {
+                       /* No forward progress possible, give up */
+                       break;
+               }
+               cur = next;
+       }
+
+       return FALSE;
+}
 
 /*
index de7e8fe851cbef3ffa9e5b8c4c16384df6074deb..105466aa592e835af0c26729bfa394dd0a91e1cf 100644 (file)
--- a/src/url.h
+++ b/src/url.h
@@ -41,8 +41,9 @@ struct uri {
        guint fragmentlen;
 
        /* Flags */
-       guint ipv6;     /* URI contains IPv6 host */
-       guint form;     /* URI originated from form */
+       gboolean ipv6;  /* URI contains IPv6 host */
+       gboolean form;  /* URI originated from form */
+       gboolean is_phished; /* URI maybe phishing */
 };
 
 enum uri_errno {
@@ -73,5 +74,7 @@ enum protocol {
 
 void url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html);
 enum uri_errno parse_uri(struct uri *uri, gchar *uristring, memory_pool_t *pool);
+gboolean url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gint *res, gchar **url_str);
+const gchar* url_strerror (enum uri_errno err);
 
 #endif