aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-11-12 18:47:05 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-11-12 18:47:05 +0000
commit b4367c0c90b7ee7a4db77c169becff7c958ba23f (patch)
tree 044468c643dc47393f357d92a8f627f0b57efc4b
parent 43394f8ce6a64df104f77aaa071e802d1fede4e5 (diff)
download rspamd-b4367c0c90b7ee7a4db77c169becff7c958ba23f.tar.gz
rspamd-b4367c0c90b7ee7a4db77c169becff7c958ba23f.zip
Implement words decaying for text parts.
-rw-r--r-- src/libmime/message.c 8
-rw-r--r-- src/libmime/message.h 1
-rw-r--r-- src/libstat/stat_process.c 4
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 67
-rw-r--r-- src/libstat/tokenizers/tokenizers.h 4
-rw-r--r-- src/lua/lua_util.c 4
6 files changed, 74 insertions, 14 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 6074b8437..a007e61b1 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -980,9 +980,9 @@ rspamd_normalize_text_part (struct rspamd_task *task,
/* Ugly workaround */
tmp = rspamd_tokenize_text (part->content->data,
- part->content->len, IS_PART_UTF (part), task->cfg->min_word_len,
+ part->content->len, IS_PART_UTF (part), task->cfg,
part->urls_offset, FALSE,
- !(part->flags & RSPAMD_MIME_PART_FLAG_HTML));
+ NULL);
if (tmp) {
for (i = 0; i < tmp->len; i ++) {
@@ -1231,9 +1231,9 @@ process_text_part (struct rspamd_task *task,
/* Post process part */
detect_text_language (text_part);
text_part->words = rspamd_tokenize_text (text_part->content->data,
- text_part->content->len, IS_PART_UTF (text_part), task->cfg->min_word_len,
+ text_part->content->len, IS_PART_UTF (text_part), task->cfg,
text_part->urls_offset, FALSE,
- !(text_part->flags & RSPAMD_MIME_PART_FLAG_HTML));
+ &text_part->hash);
rspamd_normalize_text_part (task, text_part);
/* Calculate number of lines */
diff --git a/src/libmime/message.h b/src/libmime/message.h
index e98001448..aea5c3750 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -48,6 +48,7 @@ struct mime_text_part {
GArray *words;
GArray *normalized_words;
guint nlines;
+ guint64 hash;
};
struct received_header {
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index b252d19c4..b19663893 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -224,8 +224,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
}
if (sub != NULL) {
- words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL, compat,
- FALSE);
+ words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, compat,
+ NULL);
if (words != NULL) {
tok->tokenizer->tokenize_func (tok,
task->task_pool,
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index c86c286cc..c2e91aded 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -29,6 +29,7 @@
#include "rspamd.h"
#include "tokenizers.h"
#include "stat_internal.h"
+#include "xxhash.h"
typedef gboolean (*token_get_function) (rspamd_ftok_t * buf, gchar const **pos,
rspamd_ftok_t * token,
@@ -289,8 +290,8 @@ set_token:
GArray *
rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
- gsize min_len, GList *exceptions, gboolean compat,
- gboolean check_signature)
+ struct rspamd_config *cfg, GList *exceptions, gboolean compat,
+ guint64 *hash)
{
rspamd_ftok_t token, buf;
const gchar *pos = NULL;
@@ -298,6 +299,11 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
GArray *res;
GList *cur = exceptions;
token_get_function func;
+ guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
+ guint64 hv = 0;
+ XXH64_state_t *st;
+ gboolean decay = FALSE;
+ guint64 prob;
if (text == NULL) {
return NULL;
@@ -315,18 +321,71 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
func = rspamd_tokenizer_get_word;
}
- res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), 128);
+ if (cfg != NULL) {
+ min_len = cfg->min_word_len;
+ max_len = cfg->max_word_len;
+ word_decay = cfg->words_decay;
+ initial_size = word_decay * 2;
+ }
+
+ res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), initial_size);
+ st = XXH64_createState ();
+ XXH64_reset (st, 0);
while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) {
- if (l == 0 || (min_len > 0 && l < min_len)) {
+ if (l == 0 || (min_len > 0 && l < min_len) ||
+ (max_len > 0 && l > max_len)) {
token.begin = pos;
continue;
}
+ if (!decay) {
+ XXH64_update (st, token.begin, token.len);
+
+ /* Check for decay */
+ if (word_decay > 0 && res->len > word_decay && pos - text < (gssize)len) {
+ /* Start decay */
+ gdouble decay_prob;
+
+ decay = TRUE;
+ hv = XXH64_digest (st);
+
+ /* We assume that word is 6 symbols length in average */
+ decay_prob = (gdouble)word_decay / ((len - (pos - text)) / 6.0);
+
+ if (decay_prob >= 1.0) {
+ prob = G_MAXUINT64;
+ }
+ else {
+ prob = decay_prob * G_MAXUINT64;
+ }
+ }
+ }
+ else {
+ /* Decaying probability */
+ /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
+ hv = 2862933555777941757ULL * hv + 3037000493ULL;
+
+ if (hv > prob) {
+ token.begin = pos;
+ continue;
+ }
+ }
+
g_array_append_val (res, token);
token.begin = pos;
}
+ if (!decay) {
+ hv = XXH64_digest (st);
+ }
+
+ if (hash) {
+ *hash = hv;
+ }
+
+ XXH64_freeState (st);
+
return res;
}
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 4689d1cc6..f4c9a5ed3 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -35,8 +35,8 @@ gint token_node_compare_func (gconstpointer a, gconstpointer b);
/* Tokenize text into array of words (rspamd_ftok_t type) */
GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
- gsize min_len, GList *exceptions, gboolean compat,
- gboolean check_signature);
+ struct rspamd_config *cfg, GList *exceptions, gboolean compat,
+ guint64 *hash);
/* OSB tokenize function */
gint rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 48a07d9f2..f0c9369bf 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -456,8 +456,8 @@ lua_util_tokenize_text (lua_State *L)
exceptions = g_list_reverse (exceptions);
}
- res = rspamd_tokenize_text ((gchar *)in, len, TRUE, 0, exceptions, compat,
- check_sig);
+ res = rspamd_tokenize_text ((gchar *)in, len, TRUE, NULL, exceptions, compat,
+ NULL);
if (res == NULL) {
lua_pushnil (L);