source.dussan.org Git - rspamd.git/commitdiff
Implement words decaying for text parts.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 12 Nov 2015 18:47:05 +0000 (18:47 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 12 Nov 2015 18:47:05 +0000 (18:47 +0000)
src/libmime/message.c
src/libmime/message.h
src/libstat/stat_process.c
src/libstat/tokenizers/tokenizers.c
src/libstat/tokenizers/tokenizers.h
src/lua/lua_util.c

index 6074b84373b788c9a2d08e0d38f86058b174de39..a007e61b1a95614535d01476f32e725daef61782 100644 (file)
@@ -980,9 +980,9 @@ rspamd_normalize_text_part (struct rspamd_task *task,
 
        /* Ugly workaround */
        tmp = rspamd_tokenize_text (part->content->data,
-                       part->content->len, IS_PART_UTF (part), task->cfg->min_word_len,
+                       part->content->len, IS_PART_UTF (part), task->cfg,
                        part->urls_offset, FALSE,
-                       !(part->flags & RSPAMD_MIME_PART_FLAG_HTML));
+                       NULL);
 
        if (tmp) {
                for (i = 0; i < tmp->len; i ++) {
@@ -1231,9 +1231,9 @@ process_text_part (struct rspamd_task *task,
        /* Post process part */
        detect_text_language (text_part);
        text_part->words = rspamd_tokenize_text (text_part->content->data,
-                       text_part->content->len, IS_PART_UTF (text_part), task->cfg->min_word_len,
+                       text_part->content->len, IS_PART_UTF (text_part), task->cfg,
                        text_part->urls_offset, FALSE,
-                       !(text_part->flags & RSPAMD_MIME_PART_FLAG_HTML));
+                       &text_part->hash);
        rspamd_normalize_text_part (task, text_part);
 
        /* Calculate number of lines */
index e980014481a4f6900e13ee404666bbb5f31f0082..aea5c3750c3903d8cdc3620e88edb8fa411acc4e 100644 (file)
@@ -48,6 +48,7 @@ struct mime_text_part {
        GArray *words;
        GArray *normalized_words;
        guint nlines;
+       guint64 hash;
 };
 
 struct received_header {
index b252d19c44a47495e1548221155ee23644d0dcd3..b19663893ac2e1a4f7afc67ce4926de6efef140c 100644 (file)
@@ -224,8 +224,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
        }
 
        if (sub != NULL) {
-               words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL, compat,
-                               FALSE);
+               words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, compat,
+                               NULL);
                if (words != NULL) {
                        tok->tokenizer->tokenize_func (tok,
                                        task->task_pool,
index c86c286cc8709bd9d3b0bef2443b8ac64e4b502a..c2e91aded8196c68ac2dfe5543424c8855edc70a 100644 (file)
@@ -29,6 +29,7 @@
 #include "rspamd.h"
 #include "tokenizers.h"
 #include "stat_internal.h"
+#include "xxhash.h"
 
 typedef gboolean (*token_get_function) (rspamd_ftok_t * buf, gchar const **pos,
                rspamd_ftok_t * token,
@@ -289,8 +290,8 @@ set_token:
 
 GArray *
 rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
-               gsize min_len, GList *exceptions, gboolean compat,
-               gboolean check_signature)
+               struct rspamd_config *cfg, GList *exceptions, gboolean compat,
+               guint64 *hash)
 {
        rspamd_ftok_t token, buf;
        const gchar *pos = NULL;
@@ -298,6 +299,11 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
        GArray *res;
        GList *cur = exceptions;
        token_get_function func;
+       guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
+       guint64 hv = 0;
+       XXH64_state_t *st;
+       gboolean decay = FALSE;
+       guint64 prob;
 
        if (text == NULL) {
                return NULL;
@@ -315,18 +321,71 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
                func = rspamd_tokenizer_get_word;
        }
 
-       res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), 128);
+       if (cfg != NULL) {
+               min_len = cfg->min_word_len;
+               max_len = cfg->max_word_len;
+               word_decay = cfg->words_decay;
+               initial_size = word_decay * 2;
+       }
+
+       res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), initial_size);
+       st = XXH64_createState ();
+       XXH64_reset (st, 0);
 
        while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) {
-               if (l == 0 || (min_len > 0 && l < min_len)) {
+               if (l == 0 || (min_len > 0 && l < min_len) ||
+                                       (max_len > 0 && l > max_len)) {
                        token.begin = pos;
                        continue;
                }
 
+               if (!decay) {
+                       XXH64_update (st, token.begin, token.len);
+
+                       /* Check for decay */
+                       if (word_decay > 0 && res->len > word_decay && pos - text < (gssize)len) {
+                               /* Start decay */
+                               gdouble decay_prob;
+
+                               decay = TRUE;
+                               hv = XXH64_digest (st);
+
+                               /* We assume that word is 6 symbols length in average */
+                               decay_prob = (gdouble)word_decay / ((len - (pos - text)) / 6.0);
+
+                               if (decay_prob >= 1.0) {
+                                       prob = G_MAXUINT64;
+                               }
+                               else {
+                                       prob = decay_prob * G_MAXUINT64;
+                               }
+                       }
+               }
+               else {
+                       /* Decaying probability */
+                       /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
+                       hv = 2862933555777941757ULL * hv + 3037000493ULL;
+
+                       if (hv > prob) {
+                               token.begin = pos;
+                               continue;
+                       }
+               }
+
                g_array_append_val (res, token);
                token.begin = pos;
        }
 
+       if (!decay) {
+               hv = XXH64_digest (st);
+       }
+
+       if (hash) {
+               *hash = hv;
+       }
+
+       XXH64_freeState (st);
+
        return res;
 }
 
index 4689d1cc61ae0b8cafe421ef3918e236ba98f484..f4c9a5ed364ee3489b214ab711447fbdcee74127 100644 (file)
@@ -35,8 +35,8 @@ gint token_node_compare_func (gconstpointer a, gconstpointer b);
 
 /* Tokenize text into array of words (rspamd_ftok_t type) */
 GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
-               gsize min_len, GList *exceptions, gboolean compat,
-               gboolean check_signature);
+               struct rspamd_config *cfg, GList *exceptions, gboolean compat,
+               guint64 *hash);
 
 /* OSB tokenize function */
 gint rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
index 48a07d9f27b1eb728d64a9951f32ce9057d94b20..f0c9369bf9103dc81895769903f4365209a74ada 100644 (file)
@@ -456,8 +456,8 @@ lua_util_tokenize_text (lua_State *L)
                exceptions = g_list_reverse (exceptions);
        }
 
-       res = rspamd_tokenize_text ((gchar *)in, len, TRUE, 0, exceptions, compat,
-                       check_sig);
+       res = rspamd_tokenize_text ((gchar *)in, len, TRUE, NULL, exceptions, compat,
+                       NULL);
 
        if (res == NULL) {
                lua_pushnil (L);