From 02b6117a397bb5cba27ca63a7e2df1c5dbfd0125 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 14 Jul 2015 17:33:31 +0100 Subject: [PATCH] Implement skipping of signatures in text messages. --- src/libmime/message.c | 6 ++-- src/libstat/stat_process.c | 3 +- src/libstat/tokenizers/tokenizers.c | 45 +++++++++++++++++++++-------- src/libstat/tokenizers/tokenizers.h | 3 +- src/lua/lua_util.c | 9 ++++-- 5 files changed, 48 insertions(+), 18 deletions(-) diff --git a/src/libmime/message.c b/src/libmime/message.c index 56fa85333..2fcb4f7cd 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -1214,7 +1214,8 @@ rspamd_normalize_text_part (struct rspamd_task *task, /* Ugly workaround */ tmp = rspamd_tokenize_text (part->content->data, part->content->len, IS_PART_UTF (part), task->cfg->min_word_len, - part->urls_offset, FALSE); + part->urls_offset, FALSE, + !(part->flags & RSPAMD_MIME_PART_FLAG_HTML)); if (tmp) { for (i = 0; i < tmp->len; i ++) { @@ -1415,7 +1416,8 @@ process_text_part (struct rspamd_task *task, detect_text_language (text_part); text_part->words = rspamd_tokenize_text (text_part->content->data, text_part->content->len, IS_PART_UTF (text_part), task->cfg->min_word_len, - text_part->urls_offset, FALSE); + text_part->urls_offset, FALSE, + !(text_part->flags & RSPAMD_MIME_PART_FLAG_HTML)); rspamd_normalize_text_part (task, text_part); /* Calculate number of lines */ diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index c634944ff..5318ab144 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -333,7 +333,8 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf, } if (sub != NULL) { - words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL, compat); + words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL, compat, + FALSE); if (words != NULL) { tok->tokenizer->tokenize_func (cf, task->task_pool, diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 51ef9038d..d06afa055 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -32,7 +32,7 @@ typedef gboolean (*token_get_function) (rspamd_fstring_t * buf, gchar **pos, rspamd_fstring_t * token, - GList **exceptions, gboolean is_utf, gsize *rl); + GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature); const gchar t_delimiters[255] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, @@ -79,7 +79,7 @@ token_node_compare_func (gconstpointer a, gconstpointer b) static gboolean rspamd_tokenizer_get_word_compat (rspamd_fstring_t * buf, gchar **cur, rspamd_fstring_t * token, - GList **exceptions, gboolean is_utf, gsize *rl) + GList **exceptions, gboolean is_utf, gsize *rl, gboolean unused) { gsize remain, pos; guchar *p; @@ -171,17 +171,19 @@ rspamd_tokenizer_get_word_compat (rspamd_fstring_t * buf, static gboolean rspamd_tokenizer_get_word (rspamd_fstring_t * buf, gchar **cur, rspamd_fstring_t * token, - GList **exceptions, gboolean is_utf, gsize *rl) + GList **exceptions, gboolean is_utf, gsize *rl, + gboolean check_signature) { - gsize remain, pos; - gchar *p, *next_p; + gsize remain, pos, siglen = 0; + gchar *p, *next_p, *sig = NULL; gunichar uc; guint processed = 0; struct process_exception *ex = NULL; enum { skip_delimiters = 0, feed_token, - skip_exception + skip_exception, + process_signature } state = skip_delimiters; if (buf == NULL) { @@ -227,10 +229,18 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf, state = skip_exception; continue; } - else if (g_unichar_isgraph (uc) && !g_unichar_ispunct (uc)) { - state = feed_token; - token->begin = p; - continue; + else if (g_unichar_isgraph (uc)) { + if (!g_unichar_ispunct (uc)) { + state = feed_token; + token->begin = p; + continue; + } + else if (check_signature && pos != 0 && (*p == '_' || *p == '-')) { + sig = p; + siglen = remain; + state = process_signature; + continue; + } } break; case feed_token: @@ -247,6 +257,16 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf, *exceptions = g_list_next (*exceptions); goto set_token; break; + case process_signature: + if (*p == '\r' || *p == '\n') { + msg_debug ("signature found: %*s", siglen, sig); + return FALSE; + } + else if (*p != ' ' && *p != '-' && *p != '_') { + state = skip_delimiters; + continue; + } + break; } remain -= next_p - p; @@ -269,7 +289,8 @@ set_token: GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, - gsize min_len, GList *exceptions, gboolean compat) + gsize min_len, GList *exceptions, gboolean compat, + gboolean check_signature) { rspamd_fstring_t token, buf; gchar *pos = NULL; @@ -297,7 +318,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_fstring_t), 128); - while (func (&buf, &pos, &token, &cur, is_utf, &l)) { + while (func (&buf, &pos, &token, &cur, is_utf, &l, check_signature)) { if (l == 0 || (min_len > 0 && l < min_len)) { token.begin = pos; continue; diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index fb4b42a96..2c96b7cff 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -28,7 +28,8 @@ gint token_node_compare_func (gconstpointer a, gconstpointer b); /* Tokenize text into array of words (rspamd_fstring_t type) */ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, - gsize min_len, GList *exceptions, gboolean compat); + gsize min_len, GList *exceptions, gboolean compat, + gboolean check_signature); /* OSB tokenize function */ gint rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf, diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index 9a670da04..8d5686f7c 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -340,7 +340,7 @@ lua_util_tokenize_text (lua_State *L) struct process_exception *ex; GArray *res; rspamd_fstring_t *w; - gboolean compat = FALSE; + gboolean compat = FALSE, check_sig = FALSE; if (lua_type (L, 1) == LUA_TSTRING) { in = luaL_checklstring (L, 1, &len); @@ -389,11 +389,16 @@ lua_util_tokenize_text (lua_State *L) compat = lua_toboolean (L, 3); } + if (lua_gettop (L) > 3 && lua_type (L, 4) == LUA_TBOOLEAN) { + check_sig = lua_toboolean (L, 4); + } + if (exceptions) { exceptions = g_list_reverse (exceptions); } - res = rspamd_tokenize_text ((gchar *)in, len, TRUE, 0, exceptions, compat); + res = rspamd_tokenize_text ((gchar *)in, len, TRUE, 0, exceptions, compat, + check_sig); if (res == NULL) { lua_pushnil (L); -- 2.39.5