summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-15 15:02:48 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-15 15:02:48 +0000
commit: ee40b9d1146420d00f1ccf356716dc2c5b87e318 (patch)
tree: b010daa13cb8f27a846d782294c1f672e4b42612
parent: f4d3c21d16cd1c71f7a4bb7772e77e768a7ab8d1 (diff)
download: rspamd-ee40b9d1146420d00f1ccf356716dc2c5b87e318.tar.gz
download: rspamd-ee40b9d1146420d00f1ccf356716dc2c5b87e318.zip
[Feature] Skip stop words in statistics
-rw-r--r-- src/libstat/classifiers/bayes.c | 2
-rw-r--r-- src/libstat/stat_process.c | 6
-rw-r--r-- src/libstat/tokenizers/osb.c | 28
-rw-r--r-- src/libstat/tokenizers/tokenizers.h | 22
4 files changed, 35 insertions, 23 deletions
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index 2e494e526..1898df4fe 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -175,7 +175,7 @@ bayes_classify_token (struct rspamd_classifier *ctx,
}
/* Probability for this token */
- if (total_count > ctx->cfg->min_token_hits) {
+ if (total_count >= ctx->cfg->min_token_hits) {
spam_freq = ((double)spam_count / MAX (1., (double) ctx->spam_learns));
ham_freq = ((double)ham_count / MAX (1., (double)ctx->ham_learns));
spam_prob = spam_freq / (spam_freq + ham_freq);
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index 87c5c3190..e06bd1fe3 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -294,7 +294,7 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
lua_settop (L, 0);
st_ctx->tokenizer->tokenize_func (st_ctx,
- task->task_pool,
+ task,
ar,
TRUE,
"META:",
@@ -345,7 +345,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
part = g_ptr_array_index (task->text_parts, i);
if (!IS_PART_EMPTY (part) && part->utf_words != NULL) {
- st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool,
+ st_ctx->tokenizer->tokenize_func (st_ctx, task,
part->utf_words, IS_PART_UTF (part),
NULL, task->tokens);
}
@@ -362,7 +362,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
words = rspamd_tokenize_subject (task);
if (words != NULL) {
st_ctx->tokenizer->tokenize_func (st_ctx,
- task->task_pool,
+ task,
words,
TRUE,
"SUBJECT",
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index 8784a6858..d68e3bc60 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -17,8 +17,10 @@
* OSB tokenizer
*/
+
#include "tokenizers.h"
#include "stat_internal.h"
+#include "libmime/lang_detection.h"
/* Size for features pipe */
#define DEFAULT_FEATURE_WINDOW_SIZE 5
@@ -259,11 +261,11 @@ struct token_pipe_entry {
gint
rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
- rspamd_mempool_t *pool,
- GArray *words,
- gboolean is_utf,
- const gchar *prefix,
- GPtrArray *result)
+ struct rspamd_task *task,
+ GArray *words,
+ gboolean is_utf,
+ const gchar *prefix,
+ GPtrArray *result)
{
rspamd_token_t *new_tok = NULL;
rspamd_stat_token_t *token;
@@ -303,6 +305,14 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
token = &g_array_index (words, rspamd_stat_token_t, w);
token_flags = token->flags;
+ if (task->lang_det) {
+ if (rspamd_language_detector_is_stop_word (task->lang_det,
+ token->begin, token->len)) {
+ /* Skip it */
+ continue;
+ }
+ }
+
if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
rspamd_ftok_t ftok;
@@ -327,7 +337,7 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
}
if (token_flags & RSPAMD_STAT_TOKEN_FLAG_UNIGRAM) {
- new_tok = rspamd_mempool_alloc0 (pool, token_size);
+ new_tok = rspamd_mempool_alloc0 (task->task_pool, token_size);
new_tok->flags = token_flags;
new_tok->t1 = token;
new_tok->t2 = token;
@@ -339,7 +349,7 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
}
#define ADD_TOKEN do {\
- new_tok = rspamd_mempool_alloc0 (pool, token_size); \
+ new_tok = rspamd_mempool_alloc0 (task->task_pool, token_size); \
new_tok->flags = token_flags; \
new_tok->t1 = hashpipe[0].t; \
new_tok->t2 = hashpipe[i].t; \
@@ -375,7 +385,9 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
processed++;
for (i = 1; i < window_size; i++) {
- ADD_TOKEN;
+ if (!(hashpipe[i].t->flags & RSPAMD_STAT_TOKEN_FLAG_EXCEPTION)) {
+ ADD_TOKEN;
+ }
}
}
}
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index bfabde74f..668f08cdc 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -18,13 +18,13 @@ struct rspamd_stat_ctx;
struct rspamd_stat_tokenizer {
gchar *name;
gpointer (*get_config) (rspamd_mempool_t *pool,
- struct rspamd_tokenizer_config *cf, gsize *len);
+ struct rspamd_tokenizer_config *cf, gsize *len);
gint (*tokenize_func)(struct rspamd_stat_ctx *ctx,
- rspamd_mempool_t *pool,
- GArray *words,
- gboolean is_utf,
- const gchar *prefix,
- GPtrArray *result);
+ struct rspamd_task *task,
+ GArray *words,
+ gboolean is_utf,
+ const gchar *prefix,
+ GPtrArray *result);
};
enum rspamd_tokenize_type {
@@ -47,11 +47,11 @@ GArray * rspamd_tokenize_text (const gchar *text, gsize len,
/* OSB tokenize function */
gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
- rspamd_mempool_t *pool,
- GArray *words,
- gboolean is_utf,
- const gchar *prefix,
- GPtrArray *result);
+ struct rspamd_task *task,
+ GArray *words,
+ gboolean is_utf,
+ const gchar *prefix,
+ GPtrArray *result);
gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
struct rspamd_tokenizer_config *cf,