From ee40b9d1146420d00f1ccf356716dc2c5b87e318 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 15 Nov 2018 15:02:48 +0000 Subject: [PATCH] [Feature] Skip stop words in statistics --- src/libstat/classifiers/bayes.c | 2 +- src/libstat/stat_process.c | 6 +++--- src/libstat/tokenizers/osb.c | 28 ++++++++++++++++++++-------- src/libstat/tokenizers/tokenizers.h | 22 +++++++++++----------- 4 files changed, 35 insertions(+), 23 deletions(-) diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c index 2e494e526..1898df4fe 100644 --- a/src/libstat/classifiers/bayes.c +++ b/src/libstat/classifiers/bayes.c @@ -175,7 +175,7 @@ bayes_classify_token (struct rspamd_classifier *ctx, } /* Probability for this token */ - if (total_count > ctx->cfg->min_token_hits) { + if (total_count >= ctx->cfg->min_token_hits) { spam_freq = ((double)spam_count / MAX (1., (double) ctx->spam_learns)); ham_freq = ((double)ham_count / MAX (1., (double)ctx->ham_learns)); spam_prob = spam_freq / (spam_freq + ham_freq); diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index 87c5c3190..e06bd1fe3 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -294,7 +294,7 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx, lua_settop (L, 0); st_ctx->tokenizer->tokenize_func (st_ctx, - task->task_pool, + task, ar, TRUE, "META:", @@ -345,7 +345,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, part = g_ptr_array_index (task->text_parts, i); if (!IS_PART_EMPTY (part) && part->utf_words != NULL) { - st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool, + st_ctx->tokenizer->tokenize_func (st_ctx, task, part->utf_words, IS_PART_UTF (part), NULL, task->tokens); } @@ -362,7 +362,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, words = rspamd_tokenize_subject (task); if (words != NULL) { st_ctx->tokenizer->tokenize_func (st_ctx, - task->task_pool, + task, words, TRUE, "SUBJECT", diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c index 8784a6858..d68e3bc60 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -17,8 +17,10 @@ * OSB tokenizer */ + #include "tokenizers.h" #include "stat_internal.h" +#include "libmime/lang_detection.h" /* Size for features pipe */ #define DEFAULT_FEATURE_WINDOW_SIZE 5 @@ -259,11 +261,11 @@ struct token_pipe_entry { gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, - rspamd_mempool_t *pool, - GArray *words, - gboolean is_utf, - const gchar *prefix, - GPtrArray *result) + struct rspamd_task *task, + GArray *words, + gboolean is_utf, + const gchar *prefix, + GPtrArray *result) { rspamd_token_t *new_tok = NULL; rspamd_stat_token_t *token; @@ -303,6 +305,14 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, token = &g_array_index (words, rspamd_stat_token_t, w); token_flags = token->flags; + if (task->lang_det) { + if (rspamd_language_detector_is_stop_word (task->lang_det, + token->begin, token->len)) { + /* Skip it */ + continue; + } + } + if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { rspamd_ftok_t ftok; @@ -327,7 +337,7 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, } if (token_flags & RSPAMD_STAT_TOKEN_FLAG_UNIGRAM) { - new_tok = rspamd_mempool_alloc0 (pool, token_size); + new_tok = rspamd_mempool_alloc0 (task->task_pool, token_size); new_tok->flags = token_flags; new_tok->t1 = token; new_tok->t2 = token; @@ -339,7 +349,7 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, } #define ADD_TOKEN do {\ - new_tok = rspamd_mempool_alloc0 (pool, token_size); \ + new_tok = rspamd_mempool_alloc0 (task->task_pool, token_size); \ new_tok->flags = token_flags; \ new_tok->t1 = hashpipe[0].t; \ new_tok->t2 = hashpipe[i].t; \ @@ -375,7 +385,9 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, processed++; for (i = 1; i < window_size; i++) { - ADD_TOKEN; + if (!(hashpipe[i].t->flags & RSPAMD_STAT_TOKEN_FLAG_EXCEPTION)) { + ADD_TOKEN; + } } } } diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index bfabde74f..668f08cdc 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -18,13 +18,13 @@ struct rspamd_stat_ctx; struct rspamd_stat_tokenizer { gchar *name; gpointer (*get_config) (rspamd_mempool_t *pool, - struct rspamd_tokenizer_config *cf, gsize *len); + struct rspamd_tokenizer_config *cf, gsize *len); gint (*tokenize_func)(struct rspamd_stat_ctx *ctx, - rspamd_mempool_t *pool, - GArray *words, - gboolean is_utf, - const gchar *prefix, - GPtrArray *result); + struct rspamd_task *task, + GArray *words, + gboolean is_utf, + const gchar *prefix, + GPtrArray *result); }; enum rspamd_tokenize_type { @@ -47,11 +47,11 @@ GArray * rspamd_tokenize_text (const gchar *text, gsize len, /* OSB tokenize function */ gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, - rspamd_mempool_t *pool, - GArray *words, - gboolean is_utf, - const gchar *prefix, - GPtrArray *result); + struct rspamd_task *task, + GArray *words, + gboolean is_utf, + const gchar *prefix, + GPtrArray *result); gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool, struct rspamd_tokenizer_config *cf, -- 2.39.5