Implement word decay for text parts.

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Thu, 12 Nov 2015 18:47:05 +0000 (18:47 +0000)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Thu, 12 Nov 2015 18:47:05 +0000 (18:47 +0000)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 12 Nov 2015 18:47:05 +0000 (18:47 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 12 Nov 2015 18:47:05 +0000 (18:47 +0000)
diff --git a/src/libmime/message.c b/src/libmime/message.c

index 6074b84373b788c9a2d08e0d38f86058b174de39..a007e61b1a95614535d01476f32e725daef61782 100644 (file)
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -980,9 +980,9 @@ rspamd_normalize_text_part (struct rspamd_task *task,
  
         /* Ugly workaround */
         tmp = rspamd_tokenize_text (part->content->data,
-                       part->content->len, IS_PART_UTF (part), task->cfg->min_word_len,
+                       part->content->len, IS_PART_UTF (part), task->cfg,
                         part->urls_offset, FALSE,
-                       !(part->flags & RSPAMD_MIME_PART_FLAG_HTML));
+                       NULL);
  
         if (tmp) {
                 for (i = 0; i < tmp->len; i ++) {
@@ -1231,9 +1231,9 @@ process_text_part (struct rspamd_task *task,
         /* Post process part */
         detect_text_language (text_part);
         text_part->words = rspamd_tokenize_text (text_part->content->data,
-                       text_part->content->len, IS_PART_UTF (text_part), task->cfg->min_word_len,
+                       text_part->content->len, IS_PART_UTF (text_part), task->cfg,
                         text_part->urls_offset, FALSE,
-                       !(text_part->flags & RSPAMD_MIME_PART_FLAG_HTML));
+                       &text_part->hash);
         rspamd_normalize_text_part (task, text_part);
  
         /* Calculate number of lines */
diff --git a/src/libmime/message.h b/src/libmime/message.h

index e980014481a4f6900e13ee404666bbb5f31f0082..aea5c3750c3903d8cdc3620e88edb8fa411acc4e 100644 (file)
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -48,6 +48,7 @@ struct mime_text_part {
         GArray *words;
         GArray *normalized_words;
         guint nlines;
+       guint64 hash;
  };
  
  struct received_header {
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c

index b252d19c44a47495e1548221155ee23644d0dcd3..b19663893ac2e1a4f7afc67ce4926de6efef140c 100644 (file)
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -224,8 +224,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
         }
  
         if (sub != NULL) {
-               words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL, compat,
-                               FALSE);
+               words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, compat,
+                               NULL);
                 if (words != NULL) {
                         tok->tokenizer->tokenize_func (tok,
                                         task->task_pool,
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c

index c86c286cc8709bd9d3b0bef2443b8ac64e4b502a..c2e91aded8196c68ac2dfe5543424c8855edc70a 100644 (file)
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -29,6 +29,7 @@
  #include "rspamd.h"
  #include "tokenizers.h"
  #include "stat_internal.h"
+#include "xxhash.h"
  
  typedef gboolean (*token_get_function) (rspamd_ftok_t * buf, gchar const **pos,
                 rspamd_ftok_t * token,
@@ -289,8 +290,8 @@ set_token:
  
  GArray *
  rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
-               gsize min_len, GList *exceptions, gboolean compat,
-               gboolean check_signature)
+               struct rspamd_config *cfg, GList *exceptions, gboolean compat,
+               guint64 *hash)
  {
         rspamd_ftok_t token, buf;
         const gchar *pos = NULL;
@@ -298,6 +299,11 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
         GArray *res;
         GList *cur = exceptions;
         token_get_function func;
+       guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
+       guint64 hv = 0;
+       XXH64_state_t *st;
+       gboolean decay = FALSE;
+       guint64 prob;
  
         if (text == NULL) {
                 return NULL;
@@ -315,18 +321,71 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
                 func = rspamd_tokenizer_get_word;
         }
  
-       res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), 128);
+       if (cfg != NULL) {
+               min_len = cfg->min_word_len;
+               max_len = cfg->max_word_len;
+               word_decay = cfg->words_decay;
+               initial_size = word_decay * 2;
+       }
+
+       res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), initial_size);
+       st = XXH64_createState ();
+       XXH64_reset (st, 0);
  
         while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) {
-               if (l == 0 || (min_len > 0 && l < min_len)) {
+               if (l == 0 || (min_len > 0 && l < min_len) ||
+                                       (max_len > 0 && l > max_len)) {
                         token.begin = pos;
                         continue;
                 }
  
+               if (!decay) {
+                       XXH64_update (st, token.begin, token.len);
+
+                       /* Check for decay */
+                       if (word_decay > 0 && res->len > word_decay && pos - text < (gssize)len) {
+                               /* Start decay */
+                               gdouble decay_prob;
+
+                               decay = TRUE;
+                               hv = XXH64_digest (st);
+
+                               /* We assume that a word is 6 symbols long on average */
+                               decay_prob = (gdouble)word_decay / ((len - (pos - text)) / 6.0);
+
+                               if (decay_prob >= 1.0) {
+                                       prob = G_MAXUINT64;
+                               }
+                               else {
+                                       prob = decay_prob * G_MAXUINT64;
+                               }
+                       }
+               }
+               else {
+                       /* Decaying probability */
+                       /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
+                       hv = 2862933555777941757ULL * hv + 3037000493ULL;
+
+                       if (hv > prob) {
+                               token.begin = pos;
+                               continue;
+                       }
+               }
+
                 g_array_append_val (res, token);
                 token.begin = pos;
         }
  
+       if (!decay) {
+               hv = XXH64_digest (st);
+       }
+
+       if (hash) {
+               *hash = hv;
+       }
+
+       XXH64_freeState (st);
+
         return res;
  }
  
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h

index 4689d1cc61ae0b8cafe421ef3918e236ba98f484..f4c9a5ed364ee3489b214ab711447fbdcee74127 100644 (file)
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -35,8 +35,8 @@ gint token_node_compare_func (gconstpointer a, gconstpointer b);
  
  /* Tokenize text into array of words (rspamd_ftok_t type) */
  GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
-               gsize min_len, GList *exceptions, gboolean compat,
-               gboolean check_signature);
+               struct rspamd_config *cfg, GList *exceptions, gboolean compat,
+               guint64 *hash);
  
  /* OSB tokenize function */
  gint rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c

index 48a07d9f27b1eb728d64a9951f32ce9057d94b20..f0c9369bf9103dc81895769903f4365209a74ada 100644 (file)
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -456,8 +456,8 @@ lua_util_tokenize_text (lua_State *L)
                 exceptions = g_list_reverse (exceptions);
         }
  
-       res = rspamd_tokenize_text ((gchar *)in, len, TRUE, 0, exceptions, compat,
-                       check_sig);
+       res = rspamd_tokenize_text ((gchar *)in, len, TRUE, NULL, exceptions, compat,
+                       NULL);
  
         if (res == NULL) {
                 lua_pushnil (L);
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Thu, 12 Nov 2015 18:47:05 +0000 (18:47 +0000)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Thu, 12 Nov 2015 18:47:05 +0000 (18:47 +0000)
src/libmime/message.c		patch \| blob \| history
src/libmime/message.h		patch \| blob \| history
src/libstat/stat_process.c		patch \| blob \| history
src/libstat/tokenizers/tokenizers.c		patch \| blob \| history
src/libstat/tokenizers/tokenizers.h		patch \| blob \| history
src/lua/lua_util.c		patch \| blob \| history