diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/plugins/fuzzy_check.c | 35 | ||||
-rw-r--r-- | src/trie.c | 223 | ||||
-rw-r--r-- | src/trie.h | 64 | ||||
-rw-r--r-- | src/url.c | 504 |
4 files changed, 679 insertions, 147 deletions
diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c index 4c78d33b7..edfc1caa8 100644 --- a/src/plugins/fuzzy_check.c +++ b/src/plugins/fuzzy_check.c @@ -118,6 +118,7 @@ struct fuzzy_learn_session { }; static struct fuzzy_ctx *fuzzy_module_ctx = NULL; +static const gchar hex_digits[] = "0123456789abcdef"; static int fuzzy_mime_filter (struct worker_task *task); static void fuzzy_symbol_callback (struct worker_task *task, void *unused); @@ -296,6 +297,27 @@ fuzzy_normalize (int32_t in, double weight) return (double)in; } +static const char * +fuzzy_to_string (fuzzy_hash_t *h) +{ + static char strbuf [FUZZY_HASHLEN * 2 + 1]; + int i; + guint8 byte; + + for (i = 0; i < FUZZY_HASHLEN; i ++) { + byte = h->hash_pipe[i]; + if (byte == '\0') { + break; + } + strbuf[i * 2] = hex_digits[byte >> 4]; + strbuf[i * 2 + 1] = hex_digits[byte & 0xf]; + } + + strbuf[i * 2] = '\0'; + + return strbuf; +} + int fuzzy_check_module_init (struct config_file *cfg, struct module_ctx **ctx) { @@ -463,8 +485,8 @@ fuzzy_io_callback (int fd, short what, void *arg) symbol = map->symbol; nval = fuzzy_normalize (value, map->weight); } - msg_info ("<%s>, found fuzzy hash with weight: %.2f, in list: %d", - session->task->message_id, flag, nval); + msg_info ("<%s>, found fuzzy hash '%s' with weight: %.2f, in list: %d", + session->task->message_id, fuzzy_to_string (session->h), flag, nval); rspamd_snprintf (buf, sizeof (buf), "%d: %d / %.2f", flag, value, nval); insert_result (session->task, symbol, nval, g_list_prepend (NULL, memory_pool_strdup (session->task->task_pool, buf))); @@ -527,7 +549,8 @@ fuzzy_learn_callback (int fd, short what, void *arg) goto err; } else if (buf[0] == 'O' && buf[1] == 'K') { - msg_info ("added fuzzy hash for message <%s>", session->task->message_id); + msg_info ("added fuzzy hash '%s' to list: %d for message <%s>", + fuzzy_to_string (session->h), session->flag, session->task->message_id); r = rspamd_snprintf (buf, sizeof (buf), "OK" CRLF); if (! rspamd_dispatcher_write (session->session->dispatcher, buf, r, FALSE, FALSE)) { return; @@ -823,7 +846,7 @@ fuzzy_process_handler (struct controller_session *session, f_str_t * in) return; } - msg_info ("save hash of image: [%s]", checksum); + msg_info ("save hash of image: [%s] to list: %d", checksum, flag); g_free (checksum); } } @@ -852,9 +875,9 @@ fuzzy_process_handler (struct controller_session *session, f_str_t * in) free_task (task, FALSE); return; } - msg_info ("save hash of part of type: %s/%s: [%s]", + msg_info ("save hash of part of type: %s/%s: [%s] to list %d", mime_part->type->type, mime_part->type->subtype, - checksum); + checksum, flag); g_free (checksum); } } diff --git a/src/trie.c b/src/trie.c new file mode 100644 index 000000000..945a2aa8e --- /dev/null +++ b/src/trie.c @@ -0,0 +1,223 @@ +/* Copyright (c) 2010, Vsevolod Stakhov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * XXX: This code was derived from CamelTrie implementation (lgpl code) and + * is subject to be rewritten completely from scratch (or from bsd grep) + */ + +#include "config.h" +#include "mem_pool.h" +#include "trie.h" + + +rspamd_trie_t* +rspamd_trie_create (gboolean icase) +{ + rspamd_trie_t *new; + + new = g_malloc (sizeof (rspamd_trie_t)); + + new->icase = icase; + new->pool = memory_pool_new (memory_pool_get_size ()); + new->root.fail = NULL; + new->root.final = 0; + new->root.id = 0; + new->root.next = NULL; + new->root.match = NULL; + new->fail_states = g_ptr_array_sized_new (8); + + return new; +} + +/* + * Insert a single character as level of binary trie + */ +static struct rspamd_trie_state * +rspamd_trie_insert_char (rspamd_trie_t *trie, gint depth, struct rspamd_trie_state *q, gchar c) +{ + struct rspamd_trie_match *m; + + /* Insert new match into a chain */ + m = memory_pool_alloc (trie->pool, sizeof (struct rspamd_trie_match)); + m->next = q->match; + m->c = c; + + q->match = m; + m->state = memory_pool_alloc (trie->pool, sizeof (struct rspamd_trie_state)); + q = m->state; + q->match = NULL; + q->fail = &trie->root; + q->final = 0; + q->id = -1; + + if (trie->fail_states->len < depth + 1) { + /* Grow fail states array */ + guint size = trie->fail_states->len; + + size = MAX (size + 64, depth + 1); + g_ptr_array_set_size (trie->fail_states, size); + } + + q->next = trie->fail_states->pdata[depth]; + trie->fail_states->pdata[depth] = q; + + return q; +} + +G_INLINE_FUNC struct rspamd_trie_match * +check_match (struct rspamd_trie_state *s, gchar c) +{ + struct rspamd_trie_match *m = s->match; + + while (m && m->c != c) { + m = m->next; + } + + return m; +} + +void +rspamd_trie_insert (rspamd_trie_t *trie, const gchar *pattern, gint pattern_id) +{ + const guchar *p = pattern; + struct rspamd_trie_state *q, *q1, *r; + struct rspamd_trie_match *m, *n; + gint i, depth = 0; + gchar c; + + /* Insert pattern to the trie */ + + q = &trie->root; + + while (*p) { + c = trie->icase ? g_ascii_tolower (*p) : *p; + m = check_match (q, c); + if (m == NULL) { + /* Insert char at specified level depth */ + q = rspamd_trie_insert_char (trie, depth, q, c); + } + else { + /* Switch current state to matched state */ + q = m->state; + } + p ++; + depth ++; + } + + q->final = depth; + q->id = pattern_id; + + /* Update fail states and build fail states graph */ + /* Go throught the whole depth of prefixes */ + for (i = 0; i < trie->fail_states->len; i++) { + q = trie->fail_states->pdata[i]; + while (q) { + m = q->match; + while (m) { + c = m->c; + q1 = m->state; + r = q->fail; + /* Move q->fail to last known fail location for this character (or to NULL) */ + while (r && (n = check_match (r, c)) == NULL) { + r = r->fail; + } + + /* We have found new fail location for character c, so set it in q1 */ + if (r != NULL) { + q1->fail = n->state; + if (q1->fail->final > q1->final) { + q1->final = q1->fail->final; + } + } + else { + /* Search from root */ + if ((n = check_match (&trie->root, c))) { + q1->fail = n->state; + } + else { + q1->fail = &trie->root; + } + } + + m = m->next; + } + + q = q->next; + } + } +} + +const gchar* +rspamd_trie_lookup (rspamd_trie_t *trie, const gchar *buffer, gsize buflen, gint *matched_id) +{ + const guchar *p = buffer, *prev, *pat; + struct rspamd_trie_state *q; + struct rspamd_trie_match *m = NULL; + gchar c; + + + q = &trie->root; + prev = p; + pat = p; + + while (buflen) { + c = trie->icase ? g_ascii_tolower (*p) : *p; + + while (q != NULL && (m = check_match (q, c)) == NULL) { + q = q->fail; + } + + if (q == &trie->root) { + pat = prev; + } + + if (q == NULL) { + q = &trie->root; + pat = p; + } + else if (m != NULL) { + q = m->state; + + if (q->final) { + if (matched_id) { + *matched_id = q->id; + } + return (const gchar *) pat; + } + } + p ++; + prev = p; + buflen --; + } + + return NULL; +} + +void +rspamd_trie_free (rspamd_trie_t *trie) +{ + g_ptr_array_free (trie->fail_states, TRUE); + memory_pool_delete (trie->pool); + g_free (trie); +} diff --git a/src/trie.h b/src/trie.h new file mode 100644 index 000000000..f87116275 --- /dev/null +++ b/src/trie.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2010, Vsevolod Stakhov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#ifndef TRIE_H_ +#define TRIE_H_ + +#include "config.h" +#include "mem_pool.h" + +/* + * Rspamd implements basic bitwise prefixed trie structure + */ + +struct rspamd_trie_match; + +struct rspamd_trie_state { + struct rspamd_trie_state *next; + struct rspamd_trie_state *fail; + struct rspamd_trie_match *match; + guint final; + gint id; +}; + +struct rspamd_trie_match { + struct rspamd_trie_match *next; + struct rspamd_trie_state *state; + gchar c; +}; + +typedef struct rspamd_trie_s { + struct rspamd_trie_state root; + GPtrArray *fail_states; + gboolean icase; + memory_pool_t *pool; +} rspamd_trie_t; + +rspamd_trie_t* rspamd_trie_create (gboolean icase); + +void rspamd_trie_insert (rspamd_trie_t *trie, const gchar *pattern, gint pattern_id); +const gchar* rspamd_trie_lookup (rspamd_trie_t *trie, const gchar *buffer, gsize buflen, gint *matched_id); +void rspamd_trie_free (rspamd_trie_t *trie); + +#endif /* TRIE_H_ */ @@ -28,6 +28,7 @@ #include "fstring.h" #include "main.h" #include "message.h" +#include "trie.h" #define POST_CHAR 1 #define POST_CHAR_S "\001" @@ -49,24 +50,55 @@ struct _proto { unsigned int need_ssl:1; }; -static const char *text_url = "((https?|ftp)://)?" - "(\\b(?<![.\\@A-Za-z0-9-])" "(?: [A-Za-z0-9][A-Za-z0-9-]*(?:\\.[A-Za-z0-9-]+)*\\." - "(?i:com|net|org|biz|edu|gov|info|name|int|mil|aero|coop|jobs|mobi|museum|pro|travel" - "|cc|[rs]u|uk|ua|by|de|jp|fr|fi|no|no|ca|it|ro|cn|nl|at|nu|se" - "|[a-z]{2}" "(?(1)|(?=/)))" "(?!\\w)" - "|(?:\\d{1,3}\\.){3}\\d{1,3}(?(1)|(?=[/:]))" /* ip in dotted view */ - "|\\d{5,20}(?(1)|(?=[/:]))" /* ip in numeric view */ - ")" "(?::\\d{1,5})?" /* port */ - "(?!\\.\\w)" /* host part ended, no more of this further on */ - "(?:[/?][;/?:@&=+\\$,[\\]\\-_.!~*'()A-Za-z0-9#%]*)?" /* path (&query) */ - "(?<![\\s>?!),.'\"\\]:])" "(?!@)" ")"; -static const char *html_url = "(?: src|href)=\"?(" "((https?|ftp)://)?" "(\\b(?<![.\\@A-Za-z0-9-])" "(?: [A-Za-z0-9][A-Za-z0-9-]*(?:\\.[A-Za-z0-9-]+)*\\." "(?i:com|net|org|biz|edu|gov|info|name|int|mil|aero|coop|jobs|mobi|museum|pro|travel" "|[rs]u|uk|ua|by|de|jp|fr|fi|no|no|ca|it|ro|cn|nl|at|nu|se" "|[a-z]{2}" "(?(1)|(?=/)))" "(?!\\w)" "|(?:\\d{1,3}\\.){3}\\d{1,3}(?(1)|(?=[/:]))" ")" "(?::\\d{1,5})?" /* port */ - "(?!\\.\\w)" /* host part ended, no more of this further on */ - "(?:[/?][;/?:@&=+\\$,[\\]\\-_.!~*'()A-Za-z0-9#%]*)?" /* path (&query) */ - "(?<![\\s>?!),.'\"\\]:])" "(?!@)" "))\"?"; - -static short url_initialized = 0; -GRegex *text_re, *html_re; +typedef struct url_match_s { + const gchar *m_begin; + gsize m_len; + const gchar *pattern; + const gchar *prefix; +} url_match_t; + +struct url_matcher { + const gchar *pattern; + const gchar *prefix; + gboolean (*start)(const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match); + gboolean (*end)(const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match); +}; + +static gboolean url_file_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match); +static gboolean url_file_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match); + +static gboolean url_web_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match); +static gboolean url_web_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match); + +static gboolean url_email_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match); +static gboolean url_email_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match); + +struct url_matcher matchers[] = { + { "file://", "", url_file_start, url_file_end }, + { "ftp://", "", url_web_start, url_web_end }, + { "sftp://", "", url_web_start, url_web_end }, + { "http://", "", url_web_start, url_web_end }, + { "https://", "", url_web_start, url_web_end }, + { "news://", "", url_web_start, url_web_end }, + { "nntp://", "", url_web_start, url_web_end }, + { "telnet://", "", url_web_start, url_web_end }, + { "webcal://", "", url_web_start, url_web_end }, + { "mailto://", "", url_email_start, url_email_end }, + { "callto://", "", url_web_start, url_web_end }, + { "h323:", "", url_web_start, url_web_end }, + { "sip:", "", url_web_start, url_web_end }, + { "www.", "http://", url_web_start, url_web_end }, + { "ftp.", "ftp://", url_web_start, url_web_end }, + { "@", "mailto://",url_email_start, url_email_end } +}; + +struct url_match_scanner { + struct url_matcher *matchers; + gsize matchers_count; + rspamd_trie_t *patterns; +}; + +struct url_match_scanner *url_scanner = NULL; static const struct _proto protocol_backends[] = { {"file", 0, NULL, 1, 0, 0, 0}, @@ -78,40 +110,6 @@ static const struct _proto protocol_backends[] = { {NULL, 0, NULL, 0, 0, 1, 0}, }; -/* - Table of "reserved" and "unsafe" characters. Those terms are - rfc1738-speak, as such largely obsoleted by rfc2396 and later - specs, but the general idea remains. - - A reserved character is the one that you can't decode without - changing the meaning of the URL. For example, you can't decode - "/foo/%2f/bar" into "/foo///bar" because the number and contents of - path components is different. Non-reserved characters can be - changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". The - unsafe characters are loosely based on rfc1738, plus "$" and ",", - as recommended by rfc2396, and minus "~", which is very frequently - used (and sometimes unrecognized as %7E by broken servers). - - An unsafe character is the one that should be encoded when URLs are - placed in foreign environments. E.g. space and newline are unsafe - in HTTP contexts because HTTP uses them as separator and line - terminator, so they must be encoded to %20 and %0A respectively. - "*" is unsafe in shell context, etc. - - We determine whether a character is unsafe through static table - lookup. This code assumes ASCII character set and 8-bit chars. */ - -enum { - /* rfc1738 reserved chars + "$" and ",". */ - urlchr_reserved = 1, - - /* rfc1738 unsafe chars, plus non-printables. */ - urlchr_unsafe = 2 -}; - -#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask)) -#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved) -#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe) /* Convert an ASCII hex digit to the corresponding number between 0 and 15. H should be a hexadecimal digit that satisfies isxdigit; otherwise, the result is undefined. */ @@ -123,43 +121,44 @@ enum { #define XNUM_TO_DIGIT(x) ("0123456789ABCDEF"[x] + 0) #define XNUM_TO_digit(x) ("0123456789abcdef"[x] + 0) -/* Shorthands for the table: */ -#define R urlchr_reserved -#define U urlchr_unsafe -#define RU R|U - -static const unsigned char urlchr_table[256] = { - U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */ - U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */ - U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */ - U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */ - U, 0, U, RU, R, U, R, 0, /* SP ! " # $ % & ' */ - 0, 0, 0, R, R, 0, 0, R, /* ( ) * + , - . / */ - 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */ - 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */ - RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */ - 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */ - 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */ - 0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */ - U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */ - 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */ - 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */ - 0, 0, 0, U, U, U, 0, U, /* x y z { | } ~ DEL */ - - U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, - U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, - U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, - U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, - - U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, - U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, - U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, - U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, +static guchar url_scanner_table[256] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 1, 1, 9, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 24,128,160,128,128,128,128,128,160,160,128,128,160,192,160,160, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,160,160, 32,128, 32,128, + 160, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,160,160,160,128,128, + 128, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,128,128,128,128, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; -#undef R -#undef U -#undef RU +enum { + IS_CTRL = (1 << 0), + IS_ALPHA = (1 << 1), + IS_DIGIT = (1 << 2), + IS_LWSP = (1 << 3), + IS_SPACE = (1 << 4), + IS_SPECIAL = (1 << 5), + IS_DOMAIN = (1 << 6), + IS_URLSAFE = (1 << 7) +}; + +#define is_ctrl(x) ((url_scanner_table[(guchar)(x)] & IS_CTRL) != 0) +#define is_lwsp(x) ((url_scanner_table[(guchar)(x)] & IS_LWSP) != 0) +#define is_atom(x) ((url_scanner_table[(guchar)(x)] & (IS_SPECIAL|IS_SPACE|IS_CTRL)) == 0) +#define is_alpha(x) ((url_scanner_table[(guchar)(x)] & IS_ALPHA) != 0) +#define is_digit(x) ((url_scanner_table[(guchar)(x)] & IS_DIGIT) != 0) +#define is_domain(x) ((url_scanner_table[(guchar)(x)] & IS_DOMAIN) != 0) +#define is_urlsafe(x) ((url_scanner_table[(guchar)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0) + static const char * url_strerror (enum uri_errno err) @@ -216,21 +215,15 @@ check_uri_file (unsigned char *name) static int url_init (void) { - GError *err = NULL; - if (url_initialized == 0) { - text_re = g_regex_new (text_url, G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_OPTIMIZE | G_REGEX_EXTENDED, 0, &err); - if (err != NULL) { - msg_info ("cannot init text url parsing regexp: %s", err->message); - g_error_free (err); - return -1; + int i; + if (url_scanner == NULL) { + url_scanner = g_malloc (sizeof (struct url_match_scanner)); + url_scanner->matchers = matchers; + url_scanner->matchers_count = G_N_ELEMENTS (matchers); + url_scanner->patterns = rspamd_trie_create (TRUE); + for (i = 0; i < url_scanner->matchers_count; i ++) { + rspamd_trie_insert (url_scanner->patterns, matchers[i].pattern, i); } - html_re = g_regex_new (html_url, G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_OPTIMIZE | G_REGEX_EXTENDED, 0, &err); - if (err != NULL) { - msg_info ("cannot init html url parsing regexp: %s", err->message); - g_error_free (err); - return -1; - } - url_initialized = 1; } return 0; @@ -398,15 +391,8 @@ url_strip (char *s) *t = '\0'; } -/* The core of url_escape_* functions. Escapes the characters that - match the provided mask in urlchr_table. - - If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars - will be returned unchanged. If ALLOW_PASSTHROUGH is zero, a - freshly allocated string will be returned in all cases. */ - static char * -url_escape_1 (const char *s, unsigned char mask, int allow_passthrough, memory_pool_t * pool) +url_escape_1 (const char *s, int allow_passthrough, memory_pool_t * pool) { const char *p1; char *p2, *newstr; @@ -414,8 +400,9 @@ url_escape_1 (const char *s, unsigned char mask, int allow_passthrough, memory_p int addition = 0; for (p1 = s; *p1; p1++) - if (urlchr_test (*p1, mask)) + if (!is_urlsafe (*p1)) { addition += 2; /* Two more characters (hex digits) */ + } if (!addition) { if (allow_passthrough) { @@ -433,7 +420,7 @@ url_escape_1 (const char *s, unsigned char mask, int allow_passthrough, memory_p p2 = newstr; while (*p1) { /* Quote the characters that match the test mask. */ - if (urlchr_test (*p1, mask)) { + if (!is_urlsafe (*p1)) { unsigned char c = *p1++; *p2++ = '%'; *p2++ = XNUM_TO_DIGIT (c >> 4); @@ -453,7 +440,7 @@ url_escape_1 (const char *s, unsigned char mask, int allow_passthrough, memory_p char * url_escape (const char *s, memory_pool_t * pool) { - return url_escape_1 (s, urlchr_unsafe, 0, pool); + return url_escape_1 (s, 0, pool); } /* URL-escape the unsafe characters (see urlchr_table) in a given @@ -462,7 +449,7 @@ url_escape (const char *s, memory_pool_t * pool) static char * url_escape_allow_passthrough (const char *s, memory_pool_t * pool) { - return url_escape_1 (s, urlchr_unsafe, 1, pool); + return url_escape_1 (s, 1, pool); } /* Decide whether the char at position P needs to be encoded. (It is @@ -481,7 +468,7 @@ char_needs_escaping (const char *p) /* Garbled %.. sequence: encode `%'. */ return 1; } - else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p)) + else if (! is_urlsafe (*p)) return 1; else return 0; @@ -574,7 +561,7 @@ unescape_single_char (char *str, char chr) static char * url_escape_dir (const char *dir, memory_pool_t * pool) { - char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1, pool); + char *newdir = url_escape_1 (dir, 1, pool); if (newdir == dir) return (char *)dir; @@ -893,14 +880,252 @@ parse_uri (struct uri *uri, unsigned char *uristring, memory_pool_t * pool) return URI_ERRNO_OK; } +static const gchar url_braces[] = { + '(', ')' , + '{', '}' , + '[', ']' , + '<', '>' , + '|', '|' , + '\'', '\'' +}; + +static gboolean +is_open_brace (gchar c) +{ + if (c == '(' || + c == '{' || + c == '[' || + c == '<' || + c == '|' || + c == '\'') { + return TRUE; + } + + return FALSE; +} + +static gboolean +url_file_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match) +{ + match->m_begin = pos; + return TRUE; +} +static gboolean +url_file_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match) +{ + const gchar *p; + gchar stop; + int i; + + p = pos + strlen (match->pattern); + if (*p == '/') { + p ++; + } + + for (i = 0; i < G_N_ELEMENTS (url_braces) / 2; i += 2) { + if (*p == url_braces[i]) { + stop = url_braces[i + 1]; + break; + } + } + + while (p < end && *p != stop && is_urlsafe (*p)) { + p ++; + } + + if (p == begin) { + return FALSE; + } + match->m_len = p - match->m_begin; + + return TRUE; + +} + + +static gboolean +url_web_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match) +{ + /* Check what we have found */ + if (pos > begin && *pos == 'w' && *(pos + 1) == 'w' && *(pos + 2) == 'w') { + if (!is_open_brace (*(pos - 1)) && !g_ascii_isspace (*(pos - 1))) { + return FALSE; + } + } + return TRUE; +} +static gboolean +url_web_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match) +{ + const gchar *p, *c; + gchar open_brace = '\0', close_brace = '\0'; + int i, brace_stack; + gboolean passwd; + guint port; + + p = pos + strlen (match->pattern); + for (i = 0; i < G_N_ELEMENTS (url_braces) / 2; i += 2) { + if (*p == url_braces[i]) { + close_brace = url_braces[i + 1]; + open_brace = *p; + break; + } + } + + /* find the end of the domain */ + if (is_atom (*p)) { + /* might be a domain or user@domain */ + c = p; + while (p < end) { + if (!is_atom (*p)) { + break; + } + + p++; + + while (p < end && is_atom (*p)) { + p++; + } + + if ((p + 1) < end && *p == '.' && (is_atom (*(p + 1)) || *(p + 1) == '/')) { + p++; + } + } + + if (*p != '@') { + p = c; + } + else { + p++; + } + + goto domain; + } + else if (is_domain (*p)) { +domain: + while (p < end) { + if (!is_domain (*p)) { + break; + } + + p++; + + while (p < end && is_domain (*p)) { + p++; + } + + if ((p + 1) < end && *p == '.' && (is_domain (*(p + 1)) || *(p + 1) == '/')) { + p++; + } + } + } + else { + return FALSE; + } + + if (p < end) { + switch (*p) { + case ':': /* we either have a port or a password */ + p++; + + if (is_digit (*p) || passwd) { + port = (*p++ - '0'); + + while (p < end && is_digit (*p) && port < 65536) { + port = (port * 10) + (*p++ - '0'); + } + + if (!passwd && (port >= 65536 || *p == '@')) { + if (p < end) { + /* this must be a password? */ + goto passwd; + } + + p--; + } + } + else { + passwd: + passwd = TRUE; + c = p; + + while (p < end && is_atom (*p)) { + p++; + } + + if ((p + 2) < end) { + if (*p == '@') { + p++; + if (is_domain (*p)) { + goto domain; + } + } + + return FALSE; + } + } + + if (p >= end || *p != '/') { + break; + } + + /* we have a '/' so there could be a path - fall through */ + case '/': /* we've detected a path component to our url */ + p++; + case '?': + while (p < end && is_urlsafe (*p)) { + if (*p == open_brace) { + brace_stack++; + } + else if (*p == close_brace) { + brace_stack--; + if (brace_stack == -1) { + break; + } + } + p++; + } + + break; + default: + break; + } + } + + /* urls are extremely unlikely to end with any + * punctuation, so strip any trailing + * punctuation off. Also strip off any closing + * double-quotes. */ + while (p > pos && strchr (",.:;?!-|}])\"", p[-1])) { + p--; + } + + match->m_len = (p - pos); + + return TRUE; +} + + +static gboolean +url_email_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match) +{ + return FALSE; +} +static gboolean +url_email_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match) +{ + return FALSE; +} + void url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html) { - GMatchInfo *info; - GError *err = NULL; - int rc; + struct url_matcher *matcher; + int rc, idx; char *url_str = NULL; struct uri *new; + const guint8 *p, *end, *pos; + url_match_t m; if (!part->orig->data || part->orig->len == 0) { msg_warn ("got empty text part"); @@ -909,27 +1134,33 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text if (url_init () == 0) { if (is_html) { - rc = g_regex_match_full (html_re, (const char *)part->orig->data, part->orig->len, 0, 0, &info, &err); + p = part->orig->data; + end = p + part->orig->len; } else { - rc = g_regex_match_full (text_re, (const char *)part->content->data, part->content->len, 0, 0, &info, &err); - + p = part->content->data; + end = p + part->content->len; } - if (rc) { - while (g_match_info_matches (info)) { - url_str = g_match_info_fetch (info, is_html ? 1 : 0); - debug_task ("extracted string with regexp: '%s', html is %s", url_str, is_html ? "on" : "off"); - if (url_str != NULL) { + while (p < end) { + if ((pos = rspamd_trie_lookup (url_scanner->patterns, p, end - p, &idx)) == NULL) { + break; + } + else { + matcher = &matchers[idx]; + m.pattern = matcher->pattern; + m.prefix = matcher->prefix; + if (matcher->start (p, pos, end, &m) && matcher->end (p, pos, end, &m)) { + url_str = memory_pool_alloc (task->task_pool, m.m_len + 1); + memcpy (url_str, m.m_begin, m.m_len); + url_str[m.m_len] = '\0'; if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) { new = memory_pool_alloc (pool, sizeof (struct uri)); if (new != NULL) { g_strstrip (url_str); rc = parse_uri (new, url_str, pool); if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) { - if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) { - g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new); - task->urls = g_list_prepend (task->urls, new); - } + g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new); + task->urls = g_list_prepend (task->urls, new); } else { msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc)); @@ -937,19 +1168,10 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text } } } - memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_free, url_str); - /* Get next match */ - g_match_info_next (info, &err); + pos += strlen (matcher->pattern); } + p = pos; } - else if (err != NULL) { - debug_task ("error matching regexp: %s", err->message); - g_free (err); - } - else { - debug_task ("cannot find url pattern in given string"); - } - g_match_info_free (info); } } |