summary | refs | log | tree | commit | diff | stats
path: root/src/libstat
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-09-25 16:50:41 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-09-25 16:52:12 +0100
commit: ac8d5f138c74f089c6cc1eb9e3c5cbc514fa22c3 (patch)
tree: f3656459d3b8004ec67ebe4f0c6f308558d36167 /src/libstat
parent: 70571f2b5513db967f68194878e177c52e69a107 (diff)
download: rspamd-ac8d5f138c74f089c6cc1eb9e3c5cbc514fa22c3.tar.gz
download: rspamd-ac8d5f138c74f089c6cc1eb9e3c5cbc514fa22c3.zip
[CritFix] Fix words decay one more time (affects long messages)
Diffstat (limited to 'src/libstat')
-rw-r--r-- src/libstat/tokenizers/tokenizers.c | 12
1 file changed, 8 insertions, 4 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 6804dff8a..c8e8e44df 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -25,6 +25,7 @@
#include <unicode/uchar.h>
#include <unicode/uiter.h>
#include <unicode/ubrk.h>
+#include <math.h>
typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
rspamd_stat_token_t * token,
@@ -181,7 +182,8 @@ rspamd_tokenize_check_limit (gboolean decay,
*hv = mum_hash_finish (*hv);
/* We assume that word is 6 symbols length in average */
- decay_prob = (gdouble)word_decay / ((total - (remain)) / avg_word_len);
+ decay_prob = (gdouble)word_decay / ((total - (remain)) / avg_word_len) * 10;
+ decay_prob = floor (decay_prob) / 10.0;
if (decay_prob >= 1.0) {
*prob = G_MAXUINT64;
@@ -251,7 +253,7 @@ rspamd_tokenize_text (const gchar *text, gsize len,
guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
guint64 hv = 0;
gboolean decay = FALSE;
- guint64 prob;
+ guint64 prob = 0;
static UBreakIterator* bi = NULL;
if (text == NULL) {
@@ -283,7 +285,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
continue;
}
- if (rspamd_tokenize_check_limit (decay, word_decay, res->len,
+ if (token.len > 0 &&
+ rspamd_tokenize_check_limit (decay, word_decay, res->len,
&hv, &prob, &token, pos - text, len)) {
if (!decay) {
decay = TRUE;
@@ -427,7 +430,8 @@ start_over:
}
}
- if (rspamd_tokenize_check_limit (decay, word_decay, res->len,
+ if (token.len > 0 &&
+ rspamd_tokenize_check_limit (decay, word_decay, res->len,
&hv, &prob, &token, p, len)) {
if (!decay) {
decay = TRUE;