* Add phishing detector (for now it only compares the host of an <a href> URL with the host of a URL found in the tag's text).

author Vsevolod Stakhov <vsevolod@rambler-co.ru>

Tue, 2 Nov 2010 15:51:18 +0000 (18:51 +0300)

committer Vsevolod Stakhov <vsevolod@rambler-co.ru>

Tue, 2 Nov 2010 15:51:18 +0000 (18:51 +0300)
author Vsevolod Stakhov <vsevolod@rambler-co.ru>
Tue, 2 Nov 2010 15:51:18 +0000 (18:51 +0300)
committer Vsevolod Stakhov <vsevolod@rambler-co.ru>
Tue, 2 Nov 2010 15:51:18 +0000 (18:51 +0300)
diff --git a/src/html.c b/src/html.c

index 15f3e9b17f25d6a856a72cd3db7a856bfc247359..8bbea95483f9336734c0c0fb67b84262aece0064 100644 (file)
--- a/src/html.c
+++ b/src/html.c
@@ -468,13 +468,12 @@ entity_cmp_num (const void *m1, const void *m2)
  }
  
  static GNode                   *
-construct_html_node (memory_pool_t * pool, gchar *text)
+construct_html_node (memory_pool_t * pool, gchar *text, gsize tag_len)
  {
         struct html_node               *html;
         GNode                          *n = NULL;
         struct html_tag                 key, *found;
         gchar                           t;
-       gint                            taglen = strlen (text);
  
         if (text == NULL || *text == '\0') {
                 return NULL;
@@ -483,7 +482,7 @@ construct_html_node (memory_pool_t * pool, gchar *text)
         html = memory_pool_alloc0 (pool, sizeof (struct html_node));
  
         /* Check whether this tag is fully closed */
-       if (*(text + taglen - 1) == '/') {
+       if (*(text + tag_len - 1) == '/') {
                 html->flags |= FL_CLOSED;
         }
  
@@ -660,41 +659,69 @@ decode_entitles (gchar *s, guint * len)
   * Find the first occurrence of find in s, ignore case.
   */
  static gchar *
-html_strcasestr (const gchar *s, const gchar *find)
+html_strncasestr (const gchar *s, const gchar *find, gsize len)
  {
         gchar                           c, sc;
-       size_t len;
+       size_t mlen;
  
         if ((c = *find++) != 0) {
                 c = g_ascii_tolower (c);
-               len = strlen (find);
+               mlen = strlen (find);
                 do {
                         do {
-                               if ((sc = *s++) == 0)
+                               if ((sc = *s++) == 0 || len -- == 0)
                                         return (NULL);
                         } while (g_ascii_tolower (sc) != c);
-               } while (g_ascii_strncasecmp (s, find, len) != 0);
+               } while (g_ascii_strncasecmp (s, find, mlen) != 0);
                 s--;
         }
         return ((gchar *)s);
  }
  
  static void
-parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t id, gchar *tag_text)
+check_phishing (struct worker_task *task, struct uri *href_url, const gchar *url_text)
  {
-       gchar                           *c = NULL, *p;
+       struct uri                     *new;
+       gchar                          *url_str;
+       gsize                           len;
+       gint                            off, rc;
+
+       len = strcspn (url_text, "<>");
+
+       if (url_try_text (task->task_pool, url_text, len, &off, &url_str)) {
+               new = memory_pool_alloc0 (task->task_pool, sizeof (struct uri));
+               if (new != NULL) {
+                       g_strstrip (url_str);
+                       rc = parse_uri (new, url_str, task->task_pool);
+                       if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) {
+                               if (g_ascii_strncasecmp (href_url->host, new->host,
+                                               MAX (href_url->hostlen, new->hostlen)) != 0) {
+                                       href_url->is_phished = TRUE;
+                               }
+                       }
+                       else {
+                               msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc));
+                       }
+               }
+       }
+
+}
+
+static void
+parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t id, gchar *tag_text, gsize tag_len)
+{
+       gchar                           *c = NULL, *p, *url_text;
         gint                            len, rc;
-       gchar                           *url_text;
         struct uri                     *url;
         gboolean                        got_single_quote = FALSE, got_double_quote = FALSE;
  
         /* For A tags search for href= and for IMG tags search for src= */
         if (id == Tag_A) {
-               c = html_strcasestr (tag_text, "href=");
+               c = html_strncasestr (tag_text, "href=", tag_len);
                 len = sizeof ("href=") - 1;
         }
         else if (id == Tag_IMG) {
-               c = html_strcasestr (tag_text, "src=");
+               c = html_strncasestr (tag_text, "src=", tag_len);
                 len = sizeof ("src=") - 1;
         }
  
@@ -707,7 +734,7 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i
                 }
                 len = 0;
                 p = c;
-               while (*p) {
+               while (*p && p - tag_text < tag_len) {
                         if (got_double_quote) {
                                 if (*p == '"') {
                                         break;
@@ -753,7 +780,9 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i
                 g_strlcpy (url_text, c, len + 1);
                 decode_entitles (url_text, NULL);
  
-               if (g_ascii_strncasecmp (url_text, "http://", sizeof ("http://") - 1) != 0) {
+               if (g_ascii_strncasecmp (url_text, "http://", sizeof ("http://") - 1) != 0 &&
+                               g_ascii_strncasecmp (url_text, "www", sizeof ("www") - 1) != 0 &&
+                               g_ascii_strncasecmp (url_text, "ftp://", sizeof ("ftp://") - 1) != 0) {
                         return;
                 }
  
@@ -761,6 +790,12 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i
                 rc = parse_uri (url, url_text, task->task_pool);
  
                 if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST && url->hostlen != 0) {
+                       /*
+                        * Check for phishing
+                        */
+                       if ((p = strchr (c, '>')) != NULL ) {
+                               check_phishing (task, url, p + 1);
+                       }
                         if (part->html_urls && g_tree_lookup (part->html_urls, url_text) == NULL) {
                                 g_tree_insert (part->html_urls, url_text, url);
                                 task->urls = g_list_prepend (task->urls, url);
@@ -770,7 +805,8 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i
  }
  
  gboolean
-add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_part *part, gchar *tag_text, GNode ** cur_level)
+add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_part *part,
+               gchar *tag_text, gsize tag_len, GNode ** cur_level)
  {
         GNode                          *new;
         struct html_node               *data;
@@ -795,17 +831,17 @@ add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_
                 part->html_nodes = new;
                 memory_pool_add_destructor (pool, (pool_destruct_func) g_node_destroy, part->html_nodes);
                 /* Call once again with root node */
-               return add_html_node (task, pool, part, tag_text, cur_level);
+               return add_html_node (task, pool, part, tag_text, tag_len, cur_level);
         }
         else {
-               new = construct_html_node (pool, tag_text);
+               new = construct_html_node (pool, tag_text, tag_len);
                 if (new == NULL) {
                         debug_task ("cannot construct HTML node for text '%s'", tag_text);
                         return -1;
                 }
                 data = new->data;
                 if (data->tag && (data->tag->id == Tag_A || data->tag->id == Tag_IMG) && ((data->flags & FL_CLOSING) == 0)) {
-                       parse_tag_url (task, part, data->tag->id, tag_text);
+                       parse_tag_url (task, part, data->tag->id, tag_text, tag_len);
                 }
                 if (data->flags & FL_CLOSING) {
                         if (!*cur_level) {
diff --git a/src/html.h b/src/html.h

index 048598f3749e2c7630f262591cd66a67f51e691d..0758183925ea9af3e4836df55bb5b5bb179132d9 100644 (file)
--- a/src/html.h
+++ b/src/html.h
@@ -207,7 +207,8 @@ struct html_node {
  /* Forwarded declaration */
  struct worker_task;
  
-gboolean add_html_node (struct worker_task *task, memory_pool_t *pool, struct mime_text_part *part, gchar *tag_text, GNode **cur_level);
+gboolean add_html_node (struct worker_task *task, memory_pool_t *pool,
+               struct mime_text_part *part, gchar *tag_text, gsize tag_len, GNode **cur_level);
  struct html_tag * get_tag_by_name (const gchar *name);
  void decode_entitles (gchar *s, guint *len);
  
diff --git a/src/lua/lua_common.c b/src/lua/lua_common.c

index 38111ea38247f72370d984d3755308a844f2cdb8..9c4cef1130fc2b3c897be2283298031e2a1bb0dd 100644 (file)
--- a/src/lua/lua_common.c
+++ b/src/lua/lua_common.c
@@ -228,6 +228,7 @@ init_lua (struct config_file *cfg)
         (void)luaopen_task (L);
         (void)luaopen_textpart (L);
         (void)luaopen_image (L);
+       (void)luaopen_url (L);
         (void)luaopen_message (L);
         (void)luaopen_classifier (L);
         (void)luaopen_statfile (L);
diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h

index ede42562a85474305b3079f12a7087254831fb09..6c51368fbf9b00fc7969d7c30840be09488616d6 100644 (file)
--- a/src/lua/lua_common.h
+++ b/src/lua/lua_common.h
@@ -30,6 +30,7 @@ gint luaopen_hash_table (lua_State *L);
  gint luaopen_trie (lua_State * L);
  gint luaopen_textpart (lua_State *L);
  gint luaopen_image (lua_State *L);
+gint luaopen_url (lua_State *L);
  gint luaopen_classifier (lua_State *L);
  gint luaopen_statfile (lua_State * L);
  void init_lua (struct config_file *cfg);
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c

index 2b19c45be0ca6718db2d83f78254cbb5130af32a..7102940aab6806515a3ab0f9776fa26e3161c8bd 100644 (file)
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -122,6 +122,23 @@ static const struct luaL_reg    imagelib_m[] = {
         {NULL, NULL}
  };
  
+/* URL methods */
+LUA_FUNCTION_DEF (url, get_host);
+LUA_FUNCTION_DEF (url, get_user);
+LUA_FUNCTION_DEF (url, get_path);
+LUA_FUNCTION_DEF (url, get_text);
+LUA_FUNCTION_DEF (url, is_phished);
+
+static const struct luaL_reg    urllib_m[] = {
+       LUA_INTERFACE_DEF (url, get_host),
+       LUA_INTERFACE_DEF (url, get_user),
+       LUA_INTERFACE_DEF (url, get_path),
+       LUA_INTERFACE_DEF (url, get_text),
+       LUA_INTERFACE_DEF (url, is_phished),
+       {"__tostring", lua_class_tostring},
+       {NULL, NULL}
+};
+
  /* Utility functions */
  static struct worker_task      *
  lua_check_task (lua_State * L)
@@ -147,8 +164,16 @@ lua_check_image (lua_State * L)
         return *((struct rspamd_image **)ud);
  }
  
+static struct uri      *
+lua_check_url (lua_State * L)
+{
+       void                           *ud = luaL_checkudata (L, 1, "rspamd{url}");
+       luaL_argcheck (L, ud != NULL, 1, "'url' expected");
+       return *((struct uri **)ud);
+}
+
  /*** Task interface    ***/
-static gint
+static int
  lua_task_get_message (lua_State * L)
  {
         GMimeMessage                  **pmsg;
@@ -193,19 +218,24 @@ lua_task_get_urls (lua_State * L)
         gint                            i = 1;
         struct worker_task             *task = lua_check_task (L);
         GList                          *cur;
-       struct uri                     *url;
+       struct uri                    **purl;
  
-       if (task != NULL) {
-               lua_newtable (L);
-               cur = g_list_first (task->urls);
-               while (cur) {
-                       url = cur->data;
-                       lua_pushstring (L, struri (url));
-                       lua_rawseti (L, -2, i++);
-                       cur = g_list_next (cur);
+       if (task) {
+               cur = task->urls;
+               if (cur != NULL) {
+                       lua_newtable (L);
+                       while (cur) {
+                               purl = lua_newuserdata (L, sizeof (struct uri *));
+                               lua_setclass (L, "rspamd{url}", -1);
+                               *purl = cur->data;
+                               lua_rawseti (L, -2, i++);
+                               cur = g_list_next (cur);
+                       }
+                       return 1;
                 }
         }
  
+       lua_pushnil (L);
         return 1;
  }
  
@@ -919,6 +949,81 @@ lua_image_get_filename (lua_State *L)
         return 1;
  }
  
+/* URL part */
+static gint
+lua_url_get_host (lua_State *L)
+{
+       struct uri                      *url = lua_check_url (L);
+
+       if (url != NULL) {
+               lua_pushlstring (L, url->host, url->hostlen);
+       }
+       else {
+               lua_pushnil (L);
+       }
+       return 1;
+}
+
+static gint
+lua_url_get_user (lua_State *L)
+{
+       struct uri                      *url = lua_check_url (L);
+
+       if (url != NULL) {
+               lua_pushlstring (L, url->user, url->userlen);
+       }
+       else {
+               lua_pushnil (L);
+       }
+
+       return 1;
+}
+
+static gint
+lua_url_get_path (lua_State *L)
+{
+       struct uri                      *url = lua_check_url (L);
+
+       if (url != NULL) {
+               lua_pushlstring (L, url->data, url->datalen);
+       }
+       else {
+               lua_pushnil (L);
+       }
+
+       return 1;
+}
+
+static gint
+lua_url_get_text (lua_State *L)
+{
+       struct uri                      *url = lua_check_url (L);
+
+       if (url != NULL) {
+               lua_pushstring (L, struri (url));
+       }
+       else {
+               lua_pushnil (L);
+       }
+
+       return 1;
+}
+
+static gint
+lua_url_is_phished (lua_State *L)
+{
+       struct uri                      *url = lua_check_url (L);
+
+       if (url != NULL) {
+               lua_pushboolean (L, url->is_phished);
+       }
+       else {
+               lua_pushnil (L);
+       }
+
+       return 1;
+}
+
  /* Init part */
  gint
  luaopen_task (lua_State * L)
@@ -946,3 +1051,12 @@ luaopen_image (lua_State * L)
  
         return 1;
  }
+
+gint
+luaopen_url (lua_State * L)
+{
+       lua_newclass (L, "rspamd{url}", urllib_m);
+       luaL_openlib (L, "rspamd_url", null_reg, 0);
+
+       return 1;
+}
diff --git a/src/message.c b/src/message.c

index 845386b219dbb1bfa6bb56dcb5a17c7cd87f4eff..60072d45dcfa6ac2beec8ad8db8d76a111af0b1b 100644 (file)
--- a/src/message.c
+++ b/src/message.c
@@ -111,9 +111,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex
                         case 1:                 /* HTML/XML */
                                 lc = '>';
                                 in_q = state = 0;
-                               *p = '\0';
-                               add_html_node (task, pool, part, tbegin, &level_ptr);
-                               *p = '>';
+                               add_html_node (task, pool, part, tbegin, p - tbegin - 1, &level_ptr);
                                 break;
  
                         case 2:                 /* PHP */
diff --git a/src/plugins/lua/phishing.lua b/src/plugins/lua/phishing.lua

new file mode 100644 (file)

index 0000000..8021a00
--- /dev/null
+++ b/src/plugins/lua/phishing.lua
@@ -0,0 +1,28 @@
+-- Phishing detection interface for selecting phished urls and inserting corresponding symbol
+--
+--
+local symbol = 'PHISHED_URL'
+
+function phishing_cb (task)
+       local urls = task:get_urls();
+
+       if urls then
+               for _,url in ipairs(urls) do
+                       if url:is_phished() then
+                               task:insert_result(symbol, 1, url:get_host())
+                       end
+               end
+       end
+end
+
+
+local opts =  rspamd_config:get_all_opt('phishing')
+if opts then
+    if opts['symbol'] then
+        symbol = opts['symbol']
+        
+        -- Register symbol's callback
+        rspamd_config:register_symbol(symbol, 1.0, 'phishing_cb')
+    end
+    -- If no symbol defined, do not register this module
+end
diff --git a/src/url.c b/src/url.c

index 63c31095ca6b889d8b68aa84e67cb8090b292874..e801527bd71339dfabc7aec99a302cc1a762c9ec 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -160,7 +160,7 @@ enum {
  #define is_urlsafe(x) ((url_scanner_table[(guchar)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0)
  
  
-static const gchar              *
+const gchar              *
  url_strerror (enum uri_errno err)
  {
         switch (err) {
@@ -1147,12 +1147,11 @@ url_email_end (const gchar *begin, const gchar *end, const gchar *pos, url_match
  void
  url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html)
  {
-       struct url_matcher             *matcher;
-       gint                            rc, idx;
+       gint                            rc, off = 0;
         gchar                           *url_str = NULL;
         struct uri                     *new;
-       const guint8                   *p, *end, *pos;
-       url_match_t                     m;
+       const guint8                   *p, *end;
+
  
         if (!part->orig->data || part->orig->len == 0) {
                 msg_warn ("got empty text part");
@@ -1169,37 +1168,61 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text
                         end = p + part->content->len;
                 }
                 while (p < end) {
-                       if ((pos = rspamd_trie_lookup (url_scanner->patterns, p, end - p, &idx)) == NULL) {
-                               break;
-                       }
-                       else {
-                               matcher = &matchers[idx];
-                               m.pattern = matcher->pattern;
-                               m.prefix = matcher->prefix;
-                               if (matcher->start (p, end, pos, &m) && matcher->end (p, end, pos, &m)) {
-                                       url_str = memory_pool_alloc (task->task_pool, m.m_len + 1);
-                                       memcpy (url_str, m.m_begin, m.m_len);
-                                       url_str[m.m_len] = '\0';
-                                       if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) {
-                                               new = memory_pool_alloc (pool, sizeof (struct uri));
-                                               if (new != NULL) {
-                                                       g_strstrip (url_str);
-                                                       rc = parse_uri (new, url_str, pool);
-                                                       if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) {
-                                                               g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
-                                                               task->urls = g_list_prepend (task->urls, new);
-                                                       }
-                                                       else {
-                                                               msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc));
-                                                       }
+                       if (url_try_text (pool, p, end - p, &off, &url_str)) {
+                               if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) {
+                                       new = memory_pool_alloc0 (pool, sizeof (struct uri));
+                                       if (new != NULL) {
+                                               g_strstrip (url_str);
+                                               rc = parse_uri (new, url_str, pool);
+                                               if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) {
+                                                       g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
+                                                       task->urls = g_list_prepend (task->urls, new);
+                                               }
+                                               else {
+                                                       msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc));
                                                 }
                                         }
                                 }
-                               pos += strlen (matcher->pattern);
                         }
-                       p = pos;
+                       else {
+                               break;
+                       }
+                       p += off;
+               }
+       }
+}
+
+gboolean
+url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gint *res, gchar **url_str)
+{
+       const gchar                    *end, *pos;
+       gint                            idx;
+       struct url_matcher             *matcher;
+       url_match_t                     m;
+
+       end = begin + len;
+       if (url_init () == 0) {
+               if ((pos = rspamd_trie_lookup (url_scanner->patterns, begin, len, &idx)) == NULL) {
+                       return FALSE;
+               }
+               else {
+                       matcher = &matchers[idx];
+                       m.pattern = matcher->pattern;
+                       m.prefix = matcher->prefix;
+                       if (matcher->start (begin, end, pos, &m) && matcher->end (begin, end, pos, &m)) {
+                               *url_str = memory_pool_alloc (pool, m.m_len + 1);
+                               memcpy (*url_str, m.m_begin, m.m_len);
+                               (*url_str)[m.m_len] = '\0';
+
+                       }
+                       if (res) {
+                               *res = strlen (matcher->pattern);
+                       }
+                       return TRUE;
                 }
         }
+
+       return FALSE;
  }
  
  /*
diff --git a/src/url.h b/src/url.h

index de7e8fe851cbef3ffa9e5b8c4c16384df6074deb..105466aa592e835af0c26729bfa394dd0a91e1cf 100644 (file)
--- a/src/url.h
+++ b/src/url.h
@@ -41,8 +41,9 @@ struct uri {
         guint fragmentlen;
  
         /* Flags */
-       guint ipv6;     /* URI contains IPv6 host */
-       guint form;     /* URI originated from form */
+       gboolean ipv6;  /* URI contains IPv6 host */
+       gboolean form;  /* URI originated from form */
+       gboolean is_phished; /* URI maybe phishing */
  };
  
  enum uri_errno {
@@ -73,5 +74,7 @@ enum protocol {
  
  void url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html);
  enum uri_errno parse_uri(struct uri *uri, gchar *uristring, memory_pool_t *pool);
+gboolean url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gint *res, gchar **url_str);
+const gchar* url_strerror (enum uri_errno err);
  
  #endif
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>
	Tue, 2 Nov 2010 15:51:18 +0000 (18:51 +0300)
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>
	Tue, 2 Nov 2010 15:51:18 +0000 (18:51 +0300)
src/html.c		patch \| blob \| history
src/html.h		patch \| blob \| history
src/lua/lua_common.c		patch \| blob \| history
src/lua/lua_common.h		patch \| blob \| history
src/lua/lua_task.c		patch \| blob \| history
src/message.c		patch \| blob \| history
src/plugins/lua/phishing.lua	[new file with mode: 0644]	patch \| blob
src/url.c		patch \| blob \| history
src/url.h		patch \| blob \| history