source.dussan.org Git - rspamd.git/commitdiff
Implement skipping of signatures in text messages.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 14 Jul 2015 16:33:31 +0000 (17:33 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 14 Jul 2015 16:33:31 +0000 (17:33 +0100)
src/libmime/message.c
src/libstat/stat_process.c
src/libstat/tokenizers/tokenizers.c
src/libstat/tokenizers/tokenizers.h
src/lua/lua_util.c

index 56fa85333861e74836406f0f8ad36b2c569c8a8d..2fcb4f7cd36567c7469770a9b9d618c91ca7bd0a 100644 (file)
@@ -1214,7 +1214,8 @@ rspamd_normalize_text_part (struct rspamd_task *task,
        /* Ugly workaround */
        tmp = rspamd_tokenize_text (part->content->data,
                        part->content->len, IS_PART_UTF (part), task->cfg->min_word_len,
-                       part->urls_offset, FALSE);
+                       part->urls_offset, FALSE,
+                       !(part->flags & RSPAMD_MIME_PART_FLAG_HTML));
 
        if (tmp) {
                for (i = 0; i < tmp->len; i ++) {
@@ -1415,7 +1416,8 @@ process_text_part (struct rspamd_task *task,
        detect_text_language (text_part);
        text_part->words = rspamd_tokenize_text (text_part->content->data,
                        text_part->content->len, IS_PART_UTF (text_part), task->cfg->min_word_len,
-                       text_part->urls_offset, FALSE);
+                       text_part->urls_offset, FALSE,
+                       !(text_part->flags & RSPAMD_MIME_PART_FLAG_HTML));
        rspamd_normalize_text_part (task, text_part);
 
        /* Calculate number of lines */
index c634944ffb35e5aeead35f5b606b7e04c9234090..5318ab1449ab994766e4618afe0bbdb8b58a0b5f 100644 (file)
@@ -333,7 +333,8 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
        }
 
        if (sub != NULL) {
-               words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL, compat);
+               words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL, compat,
+                               FALSE);
                if (words != NULL) {
                        tok->tokenizer->tokenize_func (cf,
                                        task->task_pool,
index 51ef9038dc80cb783034a6e541b8bd11202e1834..d06afa055f17998c79c1edc201f281c377684207 100644 (file)
@@ -32,7 +32,7 @@
 
 typedef gboolean (*token_get_function) (rspamd_fstring_t * buf, gchar **pos,
                rspamd_fstring_t * token,
-               GList **exceptions, gboolean is_utf, gsize *rl);
+               GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature);
 
 const gchar t_delimiters[255] = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
@@ -79,7 +79,7 @@ token_node_compare_func (gconstpointer a, gconstpointer b)
 static gboolean
 rspamd_tokenizer_get_word_compat (rspamd_fstring_t * buf,
                gchar **cur, rspamd_fstring_t * token,
-               GList **exceptions, gboolean is_utf, gsize *rl)
+               GList **exceptions, gboolean is_utf, gsize *rl, gboolean unused)
 {
        gsize remain, pos;
        guchar *p;
@@ -171,17 +171,19 @@ rspamd_tokenizer_get_word_compat (rspamd_fstring_t * buf,
 static gboolean
 rspamd_tokenizer_get_word (rspamd_fstring_t * buf,
                gchar **cur, rspamd_fstring_t * token,
-               GList **exceptions, gboolean is_utf, gsize *rl)
+               GList **exceptions, gboolean is_utf, gsize *rl,
+               gboolean check_signature)
 {
-       gsize remain, pos;
-       gchar *p, *next_p;
+       gsize remain, pos, siglen = 0;
+       gchar *p, *next_p, *sig = NULL;
        gunichar uc;
        guint processed = 0;
        struct process_exception *ex = NULL;
        enum {
                skip_delimiters = 0,
                feed_token,
-               skip_exception
+               skip_exception,
+               process_signature
        } state = skip_delimiters;
 
        if (buf == NULL) {
@@ -227,10 +229,18 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf,
                                state = skip_exception;
                                continue;
                        }
-                       else if (g_unichar_isgraph (uc) && !g_unichar_ispunct (uc)) {
-                               state = feed_token;
-                               token->begin = p;
-                               continue;
+                       else if (g_unichar_isgraph (uc)) {
+                               if (!g_unichar_ispunct (uc)) {
+                                       state = feed_token;
+                                       token->begin = p;
+                                       continue;
+                               }
+                               else if (check_signature && pos != 0 && (*p == '_' || *p == '-')) {
+                                       sig = p;
+                                       siglen = remain;
+                                       state = process_signature;
+                                       continue;
+                               }
                        }
                        break;
                case feed_token:
@@ -247,6 +257,16 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf,
                        *exceptions = g_list_next (*exceptions);
                        goto set_token;
                        break;
+               case process_signature:
+                       if (*p == '\r' || *p == '\n') {
+                               msg_debug ("signature found: %*s", siglen, sig);
+                               return FALSE;
+                       }
+                       else if (*p != ' ' && *p != '-' && *p != '_') {
+                               state = skip_delimiters;
+                               continue;
+                       }
+                       break;
                }
 
                remain -= next_p - p;
@@ -269,7 +289,8 @@ set_token:
 
 GArray *
 rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
-               gsize min_len, GList *exceptions, gboolean compat)
+               gsize min_len, GList *exceptions, gboolean compat,
+               gboolean check_signature)
 {
        rspamd_fstring_t token, buf;
        gchar *pos = NULL;
@@ -297,7 +318,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
 
        res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_fstring_t), 128);
 
-       while (func (&buf, &pos, &token, &cur, is_utf, &l)) {
+       while (func (&buf, &pos, &token, &cur, is_utf, &l, check_signature)) {
                if (l == 0 || (min_len > 0 && l < min_len)) {
                        token.begin = pos;
                        continue;
index fb4b42a965efb733ac0fc34dde96386c7ceb755e..2c96b7cfff38e7aa2627050d43bba3b830b14c3d 100644 (file)
@@ -28,7 +28,8 @@ gint token_node_compare_func (gconstpointer a, gconstpointer b);
 
 /* Tokenize text into array of words (rspamd_fstring_t type) */
 GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
-               gsize min_len, GList *exceptions, gboolean compat);
+               gsize min_len, GList *exceptions, gboolean compat,
+               gboolean check_signature);
 
 /* OSB tokenize function */
 gint rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
index 9a670da04a020152b07a2983f9eb9b6225f37d9f..8d5686f7c1442f79cbc3f1d5fbfb0b93585ee2c1 100644 (file)
@@ -340,7 +340,7 @@ lua_util_tokenize_text (lua_State *L)
        struct process_exception *ex;
        GArray *res;
        rspamd_fstring_t *w;
-       gboolean compat = FALSE;
+       gboolean compat = FALSE, check_sig = FALSE;
 
        if (lua_type (L, 1) == LUA_TSTRING) {
                in = luaL_checklstring (L, 1, &len);
@@ -389,11 +389,16 @@ lua_util_tokenize_text (lua_State *L)
                compat = lua_toboolean (L, 3);
        }
 
+       if (lua_gettop (L) > 3 && lua_type (L, 4) == LUA_TBOOLEAN) {
+               check_sig = lua_toboolean (L, 4);
+       }
+
        if (exceptions) {
                exceptions = g_list_reverse (exceptions);
        }
 
-       res = rspamd_tokenize_text ((gchar *)in, len, TRUE, 0, exceptions, compat);
+       res = rspamd_tokenize_text ((gchar *)in, len, TRUE, 0, exceptions, compat,
+                       check_sig);
 
        if (res == NULL) {
                lua_pushnil (L);