diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-07-14 17:33:31 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-07-14 17:33:31 +0100 |
commit | 02b6117a397bb5cba27ca63a7e2df1c5dbfd0125 (patch) | |
tree | 14bab1422a7d4eec4a6d2040b3d93f82f38c47f0 /src/libstat/tokenizers | |
parent | 828c31c52830e4a78da94d66c2ce8936380633e2 (diff) | |
download | rspamd-02b6117a397bb5cba27ca63a7e2df1c5dbfd0125.tar.gz rspamd-02b6117a397bb5cba27ca63a7e2df1c5dbfd0125.zip |
Implement skipping of signatures in text messages.
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 45 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 3 |
2 files changed, 35 insertions, 13 deletions
```diff
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 51ef9038d..d06afa055 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -32,7 +32,7 @@
 
 typedef gboolean (*token_get_function) (rspamd_fstring_t * buf, gchar **pos,
 		rspamd_fstring_t * token,
-		GList **exceptions, gboolean is_utf, gsize *rl);
+		GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature);
 
 const gchar t_delimiters[255] = {
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
@@ -79,7 +79,7 @@ token_node_compare_func (gconstpointer a, gconstpointer b)
 static gboolean
 rspamd_tokenizer_get_word_compat (rspamd_fstring_t * buf,
 		gchar **cur, rspamd_fstring_t * token,
-		GList **exceptions, gboolean is_utf, gsize *rl)
+		GList **exceptions, gboolean is_utf, gsize *rl, gboolean unused)
 {
 	gsize remain, pos;
 	guchar *p;
@@ -171,17 +171,19 @@ rspamd_tokenizer_get_word_compat (rspamd_fstring_t * buf,
 static gboolean
 rspamd_tokenizer_get_word (rspamd_fstring_t * buf,
 		gchar **cur, rspamd_fstring_t * token,
-		GList **exceptions, gboolean is_utf, gsize *rl)
+		GList **exceptions, gboolean is_utf, gsize *rl,
+		gboolean check_signature)
 {
-	gsize remain, pos;
-	gchar *p, *next_p;
+	gsize remain, pos, siglen = 0;
+	gchar *p, *next_p, *sig = NULL;
 	gunichar uc;
 	guint processed = 0;
 	struct process_exception *ex = NULL;
 	enum {
 		skip_delimiters = 0,
 		feed_token,
-		skip_exception
+		skip_exception,
+		process_signature
 	} state = skip_delimiters;
 
 	if (buf == NULL) {
@@ -227,10 +229,18 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf,
 				state = skip_exception;
 				continue;
 			}
-			else if (g_unichar_isgraph (uc) && !g_unichar_ispunct (uc)) {
-				state = feed_token;
-				token->begin = p;
-				continue;
+			else if (g_unichar_isgraph (uc)) {
+				if (!g_unichar_ispunct (uc)) {
+					state = feed_token;
+					token->begin = p;
+					continue;
+				}
+				else if (check_signature && pos != 0 && (*p == '_' || *p == '-')) {
+					sig = p;
+					siglen = remain;
+					state = process_signature;
+					continue;
+				}
 			}
 			break;
 		case feed_token:
@@ -247,6 +257,16 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf,
 			*exceptions = g_list_next (*exceptions);
 			goto set_token;
 			break;
+		case process_signature:
+			if (*p == '\r' || *p == '\n') {
+				msg_debug ("signature found: %*s", siglen, sig);
+				return FALSE;
+			}
+			else if (*p != ' ' && *p != '-' && *p != '_') {
+				state = skip_delimiters;
+				continue;
+			}
+			break;
 		}
 
 		remain -= next_p - p;
@@ -269,7 +289,8 @@ set_token:
 
 GArray *
 rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
-		gsize min_len, GList *exceptions, gboolean compat)
+		gsize min_len, GList *exceptions, gboolean compat,
+		gboolean check_signature)
 {
 	rspamd_fstring_t token, buf;
 	gchar *pos = NULL;
@@ -297,7 +318,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
 
 	res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_fstring_t), 128);
 
-	while (func (&buf, &pos, &token, &cur, is_utf, &l)) {
+	while (func (&buf, &pos, &token, &cur, is_utf, &l, check_signature)) {
 		if (l == 0 || (min_len > 0 && l < min_len)) {
 			token.begin = pos;
 			continue;
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index fb4b42a96..2c96b7cff 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -28,7 +28,8 @@ gint token_node_compare_func (gconstpointer a, gconstpointer b);
 
 /* Tokenize text into array of words (rspamd_fstring_t type) */
 GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
-		gsize min_len, GList *exceptions, gboolean compat);
+		gsize min_len, GList *exceptions, gboolean compat,
+		gboolean check_signature);
 
 /* OSB tokenize function */
 gint rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
```