diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-11-12 18:47:05 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-11-12 18:47:05 +0000 |
commit | b4367c0c90b7ee7a4db77c169becff7c958ba23f (patch) | |
tree | 044468c643dc47393f357d92a8f627f0b57efc4b | |
parent | 43394f8ce6a64df104f77aaa071e802d1fede4e5 (diff) | |
download | rspamd-b4367c0c90b7ee7a4db77c169becff7c958ba23f.tar.gz rspamd-b4367c0c90b7ee7a4db77c169becff7c958ba23f.zip |
Implement words decaying for text parts.
-rw-r--r-- | src/libmime/message.c | 8 | ||||
-rw-r--r-- | src/libmime/message.h | 1 | ||||
-rw-r--r-- | src/libstat/stat_process.c | 4 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 67 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 4 | ||||
-rw-r--r-- | src/lua/lua_util.c | 4 |
6 files changed, 74 insertions, 14 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c index 6074b8437..a007e61b1 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -980,9 +980,9 @@ rspamd_normalize_text_part (struct rspamd_task *task, /* Ugly workaround */ tmp = rspamd_tokenize_text (part->content->data, - part->content->len, IS_PART_UTF (part), task->cfg->min_word_len, + part->content->len, IS_PART_UTF (part), task->cfg, part->urls_offset, FALSE, - !(part->flags & RSPAMD_MIME_PART_FLAG_HTML)); + NULL); if (tmp) { for (i = 0; i < tmp->len; i ++) { @@ -1231,9 +1231,9 @@ process_text_part (struct rspamd_task *task, /* Post process part */ detect_text_language (text_part); text_part->words = rspamd_tokenize_text (text_part->content->data, - text_part->content->len, IS_PART_UTF (text_part), task->cfg->min_word_len, + text_part->content->len, IS_PART_UTF (text_part), task->cfg, text_part->urls_offset, FALSE, - !(text_part->flags & RSPAMD_MIME_PART_FLAG_HTML)); + &text_part->hash); rspamd_normalize_text_part (task, text_part); /* Calculate number of lines */ diff --git a/src/libmime/message.h b/src/libmime/message.h index e98001448..aea5c3750 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -48,6 +48,7 @@ struct mime_text_part { GArray *words; GArray *normalized_words; guint nlines; + guint64 hash; }; struct received_header { diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index b252d19c4..b19663893 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -224,8 +224,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, } if (sub != NULL) { - words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL, compat, - FALSE); + words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, compat, + NULL); if (words != NULL) { tok->tokenizer->tokenize_func (tok, task->task_pool, diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index c86c286cc..c2e91aded 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -29,6 +29,7 @@ #include "rspamd.h" #include "tokenizers.h" #include "stat_internal.h" +#include "xxhash.h" typedef gboolean (*token_get_function) (rspamd_ftok_t * buf, gchar const **pos, rspamd_ftok_t * token, @@ -289,8 +290,8 @@ set_token: GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, - gsize min_len, GList *exceptions, gboolean compat, - gboolean check_signature) + struct rspamd_config *cfg, GList *exceptions, gboolean compat, + guint64 *hash) { rspamd_ftok_t token, buf; const gchar *pos = NULL; @@ -298,6 +299,11 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, GArray *res; GList *cur = exceptions; token_get_function func; + guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128; + guint64 hv = 0; + XXH64_state_t *st; + gboolean decay = FALSE; + guint64 prob; if (text == NULL) { return NULL; @@ -315,18 +321,71 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, func = rspamd_tokenizer_get_word; } - res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), 128); + if (cfg != NULL) { + min_len = cfg->min_word_len; + max_len = cfg->max_word_len; + word_decay = cfg->words_decay; + initial_size = word_decay * 2; + } + + res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), initial_size); + st = XXH64_createState (); + XXH64_reset (st, 0); while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) { - if (l == 0 || (min_len > 0 && l < min_len)) { + if (l == 0 || (min_len > 0 && l < min_len) || + (max_len > 0 && l > max_len)) { token.begin = pos; continue; } + if (!decay) { + XXH64_update (st, token.begin, token.len); + + /* Check for decay */ + if (word_decay > 0 && res->len > word_decay && pos - text < (gssize)len) { + /* Start decay */ + gdouble decay_prob; + + decay = TRUE; + hv = XXH64_digest (st); + + /* We assume that word is 6 symbols length in average */ + decay_prob = (gdouble)word_decay / ((len - (pos - text)) / 6.0); + + if (decay_prob >= 1.0) { + prob = G_MAXUINT64; + } + else { + prob = decay_prob * G_MAXUINT64; + } + } + } + else { + /* Decaying probability */ + /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */ + hv = 2862933555777941757ULL * hv + 3037000493ULL; + + if (hv > prob) { + token.begin = pos; + continue; + } + } + g_array_append_val (res, token); token.begin = pos; } + if (!decay) { + hv = XXH64_digest (st); + } + + if (hash) { + *hash = hv; + } + + XXH64_freeState (st); + return res; } diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index 4689d1cc6..f4c9a5ed3 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -35,8 +35,8 @@ gint token_node_compare_func (gconstpointer a, gconstpointer b); /* Tokenize text into array of words (rspamd_ftok_t type) */ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, - gsize min_len, GList *exceptions, gboolean compat, - gboolean check_signature); + struct rspamd_config *cfg, GList *exceptions, gboolean compat, + guint64 *hash); /* OSB tokenize function */ gint rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt, diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index 48a07d9f2..f0c9369bf 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -456,8 +456,8 @@ lua_util_tokenize_text (lua_State *L) exceptions = g_list_reverse (exceptions); } - res = rspamd_tokenize_text ((gchar *)in, len, TRUE, 0, exceptions, compat, - check_sig); + res = rspamd_tokenize_text ((gchar *)in, len, TRUE, NULL, exceptions, compat, + NULL); if (res == NULL) { lua_pushnil (L); |