about | summary | refs | log | tree | commit | diff | stats
path: root/src/libstat/tokenizers
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-14 17:33:31 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-14 17:33:31 +0100
commit: 02b6117a397bb5cba27ca63a7e2df1c5dbfd0125 (patch)
tree: 14bab1422a7d4eec4a6d2040b3d93f82f38c47f0 /src/libstat/tokenizers
parent: 828c31c52830e4a78da94d66c2ce8936380633e2 (diff)
download: rspamd-02b6117a397bb5cba27ca63a7e2df1c5dbfd0125.tar.gz
download: rspamd-02b6117a397bb5cba27ca63a7e2df1c5dbfd0125.zip
Implement skipping of signatures in text messages.
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- src/libstat/tokenizers/tokenizers.c | 45
-rw-r--r-- src/libstat/tokenizers/tokenizers.h | 3
2 files changed, 35 insertions, 13 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 51ef9038d..d06afa055 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -32,7 +32,7 @@
typedef gboolean (*token_get_function) (rspamd_fstring_t * buf, gchar **pos,
rspamd_fstring_t * token,
- GList **exceptions, gboolean is_utf, gsize *rl);
+ GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature);
const gchar t_delimiters[255] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
@@ -79,7 +79,7 @@ token_node_compare_func (gconstpointer a, gconstpointer b)
static gboolean
rspamd_tokenizer_get_word_compat (rspamd_fstring_t * buf,
gchar **cur, rspamd_fstring_t * token,
- GList **exceptions, gboolean is_utf, gsize *rl)
+ GList **exceptions, gboolean is_utf, gsize *rl, gboolean unused)
{
gsize remain, pos;
guchar *p;
@@ -171,17 +171,19 @@ rspamd_tokenizer_get_word_compat (rspamd_fstring_t * buf,
static gboolean
rspamd_tokenizer_get_word (rspamd_fstring_t * buf,
gchar **cur, rspamd_fstring_t * token,
- GList **exceptions, gboolean is_utf, gsize *rl)
+ GList **exceptions, gboolean is_utf, gsize *rl,
+ gboolean check_signature)
{
- gsize remain, pos;
- gchar *p, *next_p;
+ gsize remain, pos, siglen = 0;
+ gchar *p, *next_p, *sig = NULL;
gunichar uc;
guint processed = 0;
struct process_exception *ex = NULL;
enum {
skip_delimiters = 0,
feed_token,
- skip_exception
+ skip_exception,
+ process_signature
} state = skip_delimiters;
if (buf == NULL) {
@@ -227,10 +229,18 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf,
state = skip_exception;
continue;
}
- else if (g_unichar_isgraph (uc) && !g_unichar_ispunct (uc)) {
- state = feed_token;
- token->begin = p;
- continue;
+ else if (g_unichar_isgraph (uc)) {
+ if (!g_unichar_ispunct (uc)) {
+ state = feed_token;
+ token->begin = p;
+ continue;
+ }
+ else if (check_signature && pos != 0 && (*p == '_' || *p == '-')) {
+ sig = p;
+ siglen = remain;
+ state = process_signature;
+ continue;
+ }
}
break;
case feed_token:
@@ -247,6 +257,16 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf,
*exceptions = g_list_next (*exceptions);
goto set_token;
break;
+ case process_signature:
+ if (*p == '\r' || *p == '\n') {
+ msg_debug ("signature found: %*s", siglen, sig);
+ return FALSE;
+ }
+ else if (*p != ' ' && *p != '-' && *p != '_') {
+ state = skip_delimiters;
+ continue;
+ }
+ break;
}
remain -= next_p - p;
@@ -269,7 +289,8 @@ set_token:
GArray *
rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
- gsize min_len, GList *exceptions, gboolean compat)
+ gsize min_len, GList *exceptions, gboolean compat,
+ gboolean check_signature)
{
rspamd_fstring_t token, buf;
gchar *pos = NULL;
@@ -297,7 +318,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_fstring_t), 128);
- while (func (&buf, &pos, &token, &cur, is_utf, &l)) {
+ while (func (&buf, &pos, &token, &cur, is_utf, &l, check_signature)) {
if (l == 0 || (min_len > 0 && l < min_len)) {
token.begin = pos;
continue;
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index fb4b42a96..2c96b7cff 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -28,7 +28,8 @@ gint token_node_compare_func (gconstpointer a, gconstpointer b);
/* Tokenize text into array of words (rspamd_fstring_t type) */
GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
- gsize min_len, GList *exceptions, gboolean compat);
+ gsize min_len, GList *exceptions, gboolean compat,
+ gboolean check_signature);
/* OSB tokenize function */
gint rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,