summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2010-11-02 18:51:18 +0300
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2010-11-02 18:51:18 +0300
commit4cba5cf8133e8ecef7953416a4e540b1cf533342 (patch)
treefb3570511f6dea667b7c132117aaaeca1521ae65
parent1ce9ce321a4d4076504adfbf2503364768d5124f (diff)
downloadrspamd-4cba5cf8133e8ecef7953416a4e540b1cf533342.tar.gz
rspamd-4cba5cf8133e8ecef7953416a4e540b1cf533342.zip
* Add phishing detector (now just compares <a href> with tag's data).
-rw-r--r--src/html.c74
-rw-r--r--src/html.h3
-rw-r--r--src/lua/lua_common.c1
-rw-r--r--src/lua/lua_common.h1
-rw-r--r--src/lua/lua_task.c134
-rw-r--r--src/message.c4
-rw-r--r--src/plugins/lua/phishing.lua28
-rw-r--r--src/url.c83
-rw-r--r--src/url.h7
9 files changed, 270 insertions, 65 deletions
diff --git a/src/html.c b/src/html.c
index 15f3e9b17..8bbea9548 100644
--- a/src/html.c
+++ b/src/html.c
@@ -468,13 +468,12 @@ entity_cmp_num (const void *m1, const void *m2)
}
static GNode *
-construct_html_node (memory_pool_t * pool, gchar *text)
+construct_html_node (memory_pool_t * pool, gchar *text, gsize tag_len)
{
struct html_node *html;
GNode *n = NULL;
struct html_tag key, *found;
gchar t;
- gint taglen = strlen (text);
if (text == NULL || *text == '\0') {
return NULL;
@@ -483,7 +482,7 @@ construct_html_node (memory_pool_t * pool, gchar *text)
html = memory_pool_alloc0 (pool, sizeof (struct html_node));
/* Check whether this tag is fully closed */
- if (*(text + taglen - 1) == '/') {
+ if (*(text + tag_len - 1) == '/') {
html->flags |= FL_CLOSED;
}
@@ -660,41 +659,69 @@ decode_entitles (gchar *s, guint * len)
* Find the first occurrence of find in s, ignore case.
*/
static gchar *
-html_strcasestr (const gchar *s, const gchar *find)
+html_strncasestr (const gchar *s, const gchar *find, gsize len)
{
gchar c, sc;
- size_t len;
+ size_t mlen;
+
+ /*
+  * Case-insensitive search for find inside the first len bytes of s.
+  * Returns a pointer to the start of the match, s itself when find is
+  * empty, or NULL when there is no match that lies entirely inside the
+  * window (or a NUL byte is reached first).
+  */
if ((c = *find++) != 0) {
c = g_ascii_tolower (c);
- len = strlen (find);
+ mlen = strlen (find);
do {
do {
- if ((sc = *s++) == 0)
+ /*
+  * A match needs the current byte plus mlen more bytes inside
+  * the window; test this before touching *s so nothing beyond
+  * len is ever read or compared.
+  */
+ if (len <= mlen || (sc = *s++) == 0)
return (NULL);
+ len--;
} while (g_ascii_tolower (sc) != c);
- } while (g_ascii_strncasecmp (s, find, len) != 0);
+ } while (g_ascii_strncasecmp (s, find, mlen) != 0);
s--;
}
return ((gchar *)s);
}
static void
-parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t id, gchar *tag_text)
+check_phishing (struct worker_task *task, struct uri *href_url, const gchar *url_text)
+{
+	/*
+	 * Phishing heuristic: look for a URL in the text that follows an <a> tag
+	 * (url_text, up to the next '<' or '>') and compare its host with the
+	 * host of the tag's own URL (href_url). When both hosts are known and
+	 * differ, href_url->is_phished is set. Nothing is changed otherwise.
+	 */
+	struct uri *new;
+	gchar *url_str = NULL;
+	gsize len;
+	gint off, rc;
+
+	if (href_url == NULL || url_text == NULL) {
+		return;
+	}
+
+	/* Only inspect the text up to the next tag boundary */
+	len = strcspn (url_text, "<>");
+
+	/*
+	 * url_try_text can report success without filling url_str when the
+	 * matcher callbacks reject the hit, so the pointer is pre-set to NULL
+	 * and checked before use.
+	 */
+	if (!url_try_text (task->task_pool, url_text, len, &off, &url_str) || url_str == NULL) {
+		return;
+	}
+
+	new = memory_pool_alloc0 (task->task_pool, sizeof (struct uri));
+	if (new == NULL) {
+		return;
+	}
+
+	g_strstrip (url_str);
+	rc = parse_uri (new, url_str, task->task_pool);
+	if (rc != URI_ERRNO_OK && rc != URI_ERRNO_NO_SLASHES && rc != URI_ERRNO_NO_HOST_SLASH) {
+		msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc));
+		return;
+	}
+
+	/* Without a host on either side there is nothing to compare */
+	if (href_url->host == NULL || new->host == NULL || new->hostlen == 0) {
+		return;
+	}
+
+	/*
+	 * Hosts are used as (pointer, length) pairs here, so compare the
+	 * lengths first and then exactly hostlen bytes, rather than reading
+	 * past the end of the shorter host.
+	 */
+	if (href_url->hostlen != new->hostlen ||
+		g_ascii_strncasecmp (href_url->host, new->host, new->hostlen) != 0) {
+		href_url->is_phished = TRUE;
+	}
+}
+
+static void
+parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t id, gchar *tag_text, gsize tag_len)
+{
+ gchar *c = NULL, *p, *url_text;
gint len, rc;
- gchar *url_text;
struct uri *url;
gboolean got_single_quote = FALSE, got_double_quote = FALSE;
/* For A tags search for href= and for IMG tags search for src= */
if (id == Tag_A) {
- c = html_strcasestr (tag_text, "href=");
+ c = html_strncasestr (tag_text, "href=", tag_len);
len = sizeof ("href=") - 1;
}
else if (id == Tag_IMG) {
- c = html_strcasestr (tag_text, "src=");
+ c = html_strncasestr (tag_text, "src=", tag_len);
len = sizeof ("src=") - 1;
}
@@ -707,7 +734,7 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i
}
len = 0;
p = c;
- while (*p) {
+ while (*p && p - tag_text < tag_len) {
if (got_double_quote) {
if (*p == '"') {
break;
@@ -753,7 +780,9 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i
g_strlcpy (url_text, c, len + 1);
decode_entitles (url_text, NULL);
- if (g_ascii_strncasecmp (url_text, "http://", sizeof ("http://") - 1) != 0) {
+ if (g_ascii_strncasecmp (url_text, "http://", sizeof ("http://") - 1) != 0 &&
+ g_ascii_strncasecmp (url_text, "www", sizeof ("www") - 1) != 0 &&
+ g_ascii_strncasecmp (url_text, "ftp://", sizeof ("ftp://") - 1) != 0) {
return;
}
@@ -761,6 +790,12 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i
rc = parse_uri (url, url_text, task->task_pool);
if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST && url->hostlen != 0) {
+ /*
+ * Check for phishing
+ */
+ if ((p = strchr (c, '>')) != NULL ) {
+ check_phishing (task, url, p + 1);
+ }
if (part->html_urls && g_tree_lookup (part->html_urls, url_text) == NULL) {
g_tree_insert (part->html_urls, url_text, url);
task->urls = g_list_prepend (task->urls, url);
@@ -770,7 +805,8 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i
}
gboolean
-add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_part *part, gchar *tag_text, GNode ** cur_level)
+add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_part *part,
+ gchar *tag_text, gsize tag_len, GNode ** cur_level)
{
GNode *new;
struct html_node *data;
@@ -795,17 +831,17 @@ add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_
part->html_nodes = new;
memory_pool_add_destructor (pool, (pool_destruct_func) g_node_destroy, part->html_nodes);
/* Call once again with root node */
- return add_html_node (task, pool, part, tag_text, cur_level);
+ return add_html_node (task, pool, part, tag_text, tag_len, cur_level);
}
else {
- new = construct_html_node (pool, tag_text);
+ new = construct_html_node (pool, tag_text, tag_len);
if (new == NULL) {
debug_task ("cannot construct HTML node for text '%s'", tag_text);
return -1;
}
data = new->data;
if (data->tag && (data->tag->id == Tag_A || data->tag->id == Tag_IMG) && ((data->flags & FL_CLOSING) == 0)) {
- parse_tag_url (task, part, data->tag->id, tag_text);
+ parse_tag_url (task, part, data->tag->id, tag_text, tag_len);
}
if (data->flags & FL_CLOSING) {
if (!*cur_level) {
diff --git a/src/html.h b/src/html.h
index 048598f37..075818392 100644
--- a/src/html.h
+++ b/src/html.h
@@ -207,7 +207,8 @@ struct html_node {
/* Forwarded declaration */
struct worker_task;
-gboolean add_html_node (struct worker_task *task, memory_pool_t *pool, struct mime_text_part *part, gchar *tag_text, GNode **cur_level);
+gboolean add_html_node (struct worker_task *task, memory_pool_t *pool,
+ struct mime_text_part *part, gchar *tag_text, gsize tag_len, GNode **cur_level);
struct html_tag * get_tag_by_name (const gchar *name);
void decode_entitles (gchar *s, guint *len);
diff --git a/src/lua/lua_common.c b/src/lua/lua_common.c
index 38111ea38..9c4cef113 100644
--- a/src/lua/lua_common.c
+++ b/src/lua/lua_common.c
@@ -228,6 +228,7 @@ init_lua (struct config_file *cfg)
(void)luaopen_task (L);
(void)luaopen_textpart (L);
(void)luaopen_image (L);
+ (void)luaopen_url (L);
(void)luaopen_message (L);
(void)luaopen_classifier (L);
(void)luaopen_statfile (L);
diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h
index ede42562a..6c51368fb 100644
--- a/src/lua/lua_common.h
+++ b/src/lua/lua_common.h
@@ -30,6 +30,7 @@ gint luaopen_hash_table (lua_State *L);
gint luaopen_trie (lua_State * L);
gint luaopen_textpart (lua_State *L);
gint luaopen_image (lua_State *L);
+gint luaopen_url (lua_State *L);
gint luaopen_classifier (lua_State *L);
gint luaopen_statfile (lua_State * L);
void init_lua (struct config_file *cfg);
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 2b19c45be..7102940aa 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -122,6 +122,23 @@ static const struct luaL_reg imagelib_m[] = {
{NULL, NULL}
};
+/*
+ * URL methods.
+ * Declarations for the url object handlers (the lua_url_* functions defined
+ * later in this file); presumably LUA_FUNCTION_DEF expands to the matching
+ * prototype - confirm against the macro definition.
+ */
+LUA_FUNCTION_DEF (url, get_host);
+LUA_FUNCTION_DEF (url, get_user);
+LUA_FUNCTION_DEF (url, get_path);
+LUA_FUNCTION_DEF (url, get_text);
+LUA_FUNCTION_DEF (url, is_phished);
+
+/* Method table passed to lua_newclass for "rspamd{url}" in luaopen_url */
+static const struct luaL_reg urllib_m[] = {
+ LUA_INTERFACE_DEF (url, get_host),
+ LUA_INTERFACE_DEF (url, get_user),
+ LUA_INTERFACE_DEF (url, get_path),
+ LUA_INTERFACE_DEF (url, get_text),
+ LUA_INTERFACE_DEF (url, is_phished),
+ {"__tostring", lua_class_tostring},
+ /* luaL_reg arrays are terminated by a {NULL, NULL} entry */
+ {NULL, NULL}
+};
+
/* Utility functions */
static struct worker_task *
lua_check_task (lua_State * L)
@@ -147,8 +164,16 @@ lua_check_image (lua_State * L)
return *((struct rspamd_image **)ud);
}
+/*
+ * Fetch the url object stored in the "rspamd{url}" userdata at stack index 1.
+ * Raises a Lua argument error when the argument is not such a userdata.
+ */
+static struct uri *
+lua_check_url (lua_State * L)
+{
+	struct uri **purl;
+
+	purl = luaL_checkudata (L, 1, "rspamd{url}");
+	luaL_argcheck (L, purl != NULL, 1, "'url' expected");
+
+	return *purl;
+}
+
/*** Task interface ***/
-static gint
+static int
lua_task_get_message (lua_State * L)
{
GMimeMessage **pmsg;
@@ -193,19 +218,24 @@ lua_task_get_urls (lua_State * L)
gint i = 1;
struct worker_task *task = lua_check_task (L);
GList *cur;
- struct uri *url;
+ struct uri **purl;
- if (task != NULL) {
- lua_newtable (L);
- cur = g_list_first (task->urls);
- while (cur) {
- url = cur->data;
- lua_pushstring (L, struri (url));
- lua_rawseti (L, -2, i++);
- cur = g_list_next (cur);
+ if (task) {
+ cur = task->urls;
+ if (cur != NULL) {
+ lua_newtable (L);
+ while (cur) {
+ purl = lua_newuserdata (L, sizeof (struct uri *));
+ lua_setclass (L, "rspamd{url}", -1);
+ *purl = cur->data;
+ lua_rawseti (L, -2, i++);
+ cur = g_list_next (cur);
+ }
+ return 1;
}
}
+ lua_pushnil (L);
return 1;
}
@@ -919,6 +949,81 @@ lua_image_get_filename (lua_State *L)
return 1;
}
+/* URL part */
+/* Lua method url:get_host(): pushes the host part (hostlen bytes) or nil */
+static gint
+lua_url_get_host (lua_State *L)
+{
+	struct uri *u = lua_check_url (L);
+
+	if (u == NULL) {
+		lua_pushnil (L);
+		return 1;
+	}
+
+	lua_pushlstring (L, u->host, u->hostlen);
+	return 1;
+}
+
+/* Lua method url:get_user(): pushes the user part (userlen bytes) or nil */
+static gint
+lua_url_get_user (lua_State *L)
+{
+	struct uri *u = lua_check_url (L);
+
+	if (u == NULL) {
+		lua_pushnil (L);
+		return 1;
+	}
+
+	lua_pushlstring (L, u->user, u->userlen);
+	return 1;
+}
+
+/* Lua method url:get_path(): pushes the url's data field (datalen bytes) or nil */
+static gint
+lua_url_get_path (lua_State *L)
+{
+	struct uri *u = lua_check_url (L);
+
+	if (u == NULL) {
+		lua_pushnil (L);
+		return 1;
+	}
+
+	lua_pushlstring (L, u->data, u->datalen);
+	return 1;
+}
+
+/* Lua method url:get_text(): pushes the string produced by struri() or nil */
+static gint
+lua_url_get_text (lua_State *L)
+{
+	struct uri *u = lua_check_url (L);
+
+	if (u == NULL) {
+		lua_pushnil (L);
+		return 1;
+	}
+
+	lua_pushstring (L, struri (u));
+	return 1;
+}
+
+/* Lua method url:is_phished(): pushes the is_phished flag as a boolean, or nil */
+static gint
+lua_url_is_phished (lua_State *L)
+{
+	struct uri *u = lua_check_url (L);
+
+	if (u == NULL) {
+		lua_pushnil (L);
+		return 1;
+	}
+
+	lua_pushboolean (L, u->is_phished);
+	return 1;
+}
+
/* Init part */
gint
luaopen_task (lua_State * L)
@@ -946,3 +1051,12 @@ luaopen_image (lua_State * L)
return 1;
}
+
+/*
+ * Register the url bindings in the given Lua state: creates the
+ * "rspamd{url}" class with the urllib_m method table and opens the
+ * "rspamd_url" library using null_reg. Always returns 1.
+ */
+gint
+luaopen_url (lua_State * L)
+{
+ lua_newclass (L, "rspamd{url}", urllib_m);
+ luaL_openlib (L, "rspamd_url", null_reg, 0);
+
+ return 1;
+}
diff --git a/src/message.c b/src/message.c
index 845386b21..60072d45d 100644
--- a/src/message.c
+++ b/src/message.c
@@ -111,9 +111,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex
case 1: /* HTML/XML */
lc = '>';
in_q = state = 0;
- *p = '\0';
- add_html_node (task, pool, part, tbegin, &level_ptr);
- *p = '>';
+ add_html_node (task, pool, part, tbegin, p - tbegin - 1, &level_ptr);
break;
case 2: /* PHP */
diff --git a/src/plugins/lua/phishing.lua b/src/plugins/lua/phishing.lua
new file mode 100644
index 000000000..8021a0069
--- /dev/null
+++ b/src/plugins/lua/phishing.lua
@@ -0,0 +1,28 @@
+-- Phishing detection interface for selecting phished urls and inserting corresponding symbol
+--
+--
+local symbol = 'PHISHED_URL'
+
+-- Symbol callback: inserts `symbol` (weight 1, option = url host) into the
+-- task result for every url of the task that is flagged as phished.
+function phishing_cb (task)
+	local urls = task:get_urls()
+
+	-- get_urls() yields nil when the task has no urls
+	if not urls then
+		return
+	end
+
+	for _, url in ipairs(urls) do
+		if url:is_phished() then
+			task:insert_result(symbol, 1, url:get_host())
+		end
+	end
+end
+
+
+-- Module registration: the callback is registered only when the 'phishing'
+-- section exists and defines a symbol name; otherwise the module stays off.
+local opts = rspamd_config:get_all_opt('phishing')
+if opts and opts['symbol'] then
+	symbol = opts['symbol']
+
+	-- Register symbol's callback
+	rspamd_config:register_symbol(symbol, 1.0, 'phishing_cb')
+end
diff --git a/src/url.c b/src/url.c
index 63c31095c..e801527bd 100644
--- a/src/url.c
+++ b/src/url.c
@@ -160,7 +160,7 @@ enum {
#define is_urlsafe(x) ((url_scanner_table[(guchar)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0)
-static const gchar *
+const gchar *
url_strerror (enum uri_errno err)
{
switch (err) {
@@ -1147,12 +1147,11 @@ url_email_end (const gchar *begin, const gchar *end, const gchar *pos, url_match
void
url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html)
{
- struct url_matcher *matcher;
- gint rc, idx;
+ gint rc, off = 0;
gchar *url_str = NULL;
struct uri *new;
- const guint8 *p, *end, *pos;
- url_match_t m;
+ const guint8 *p, *end;
+
if (!part->orig->data || part->orig->len == 0) {
msg_warn ("got empty text part");
@@ -1169,37 +1168,61 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text
end = p + part->content->len;
}
while (p < end) {
- if ((pos = rspamd_trie_lookup (url_scanner->patterns, p, end - p, &idx)) == NULL) {
- break;
- }
- else {
- matcher = &matchers[idx];
- m.pattern = matcher->pattern;
- m.prefix = matcher->prefix;
- if (matcher->start (p, end, pos, &m) && matcher->end (p, end, pos, &m)) {
- url_str = memory_pool_alloc (task->task_pool, m.m_len + 1);
- memcpy (url_str, m.m_begin, m.m_len);
- url_str[m.m_len] = '\0';
- if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) {
- new = memory_pool_alloc (pool, sizeof (struct uri));
- if (new != NULL) {
- g_strstrip (url_str);
- rc = parse_uri (new, url_str, pool);
- if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) {
- g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
- task->urls = g_list_prepend (task->urls, new);
- }
- else {
- msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc));
- }
+ if (url_try_text (pool, p, end - p, &off, &url_str)) {
+ if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) {
+ new = memory_pool_alloc0 (pool, sizeof (struct uri));
+ if (new != NULL) {
+ g_strstrip (url_str);
+ rc = parse_uri (new, url_str, pool);
+ if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) {
+ g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
+ task->urls = g_list_prepend (task->urls, new);
+ }
+ else {
+ msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc));
}
}
}
- pos += strlen (matcher->pattern);
}
- p = pos;
+ else {
+ break;
+ }
+ p += off;
+ }
+ }
+}
+
+/*
+ * Try to extract the first URL from the text [begin, begin + len).
+ *
+ * pool    - memory pool used to allocate the extracted string
+ * begin   - start of the text to scan
+ * len     - number of bytes to scan
+ * res     - optional; on success receives the offset from begin at which
+ *           scanning should resume (just past the matched pattern)
+ * url_str - on success receives a NUL-terminated copy of the URL
+ *
+ * Returns TRUE only when *url_str has been filled. Pattern hits rejected by
+ * the matcher's start/end callbacks are skipped and the scan continues, so a
+ * caller never sees TRUE together with an unset *url_str. Returns FALSE when
+ * no URL is found or url_init () fails; outputs are left untouched then.
+ */
+gboolean
+url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gint *res, gchar **url_str)
+{
+	const gchar *end, *cur, *pos;
+	gint idx;
+	gsize plen;
+	struct url_matcher *matcher;
+	url_match_t m;
+
+	if (begin == NULL || url_str == NULL || url_init () != 0) {
+		return FALSE;
+	}
+
+	cur = begin;
+	end = begin + len;
+
+	while (cur < end) {
+		if ((pos = rspamd_trie_lookup (url_scanner->patterns, cur, end - cur, &idx)) == NULL) {
+			break;
+		}
+
+		matcher = &matchers[idx];
+		m.pattern = matcher->pattern;
+		m.prefix = matcher->prefix;
+		plen = strlen (matcher->pattern);
+
+		if (matcher->start (cur, end, pos, &m) && matcher->end (cur, end, pos, &m)) {
+			*url_str = memory_pool_alloc (pool, m.m_len + 1);
+			memcpy (*url_str, m.m_begin, m.m_len);
+			(*url_str)[m.m_len] = '\0';
+
+			if (res) {
+				/*
+				 * Offset is relative to begin: the distance to the hit
+				 * plus the pattern itself, so "p += off" in a caller
+				 * really moves past this match.
+				 */
+				*res = (gint)((pos - begin) + plen);
+			}
+			return TRUE;
+		}
+
+		/* Rejected hit: skip the pattern, always making progress */
+		cur = pos + (plen > 0 ? plen : 1);
+	}
+
+	return FALSE;
+}
/*
diff --git a/src/url.h b/src/url.h
index de7e8fe85..105466aa5 100644
--- a/src/url.h
+++ b/src/url.h
@@ -41,8 +41,9 @@ struct uri {
guint fragmentlen;
/* Flags */
- guint ipv6; /* URI contains IPv6 host */
- guint form; /* URI originated from form */
+ gboolean ipv6; /* URI contains IPv6 host */
+ gboolean form; /* URI originated from form */
+ gboolean is_phished; /* URI maybe phishing */
};
enum uri_errno {
@@ -73,5 +74,7 @@ enum protocol {
void url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html);
enum uri_errno parse_uri(struct uri *uri, gchar *uristring, memory_pool_t *pool);
+gboolean url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gint *res, gchar **url_str);
+const gchar* url_strerror (enum uri_errno err);
#endif