summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-01-25 21:31:52 +0300
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-01-25 21:31:52 +0300
commit: a28536ff4d1bc30392e185f48e61d3cf858ef7b2 (patch)
tree: 1543ef4c0c6f6790859132d8d316b3b83f2d01f8
parent: 76b69f300d8372969b6143e3e269376229d03edf (diff)
download: rspamd-a28536ff4d1bc30392e185f48e61d3cf858ef7b2.tar.gz
download: rspamd-a28536ff4d1bc30392e185f48e61d3cf858ef7b2.zip
Fixes in classifying for small messages.
-rw-r--r-- src/classifiers/bayes.c 15
-rw-r--r-- src/classifiers/winnow.c 15
-rw-r--r-- src/lua/lua_config.c 2
-rw-r--r-- src/tokenizers/tokenizers.c 2
4 files changed, 26 insertions, 8 deletions
diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c
index 64783e0b4..9ef2544b0 100644
--- a/src/classifiers/bayes.c
+++ b/src/classifiers/bayes.c
@@ -178,7 +178,10 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
minnodes = strtol (value, NULL, 10);
- nodes = g_tree_nnodes (input) / FEATURE_WINDOW_SIZE;
+ nodes = g_tree_nnodes (input);
+ if (nodes > FEATURE_WINDOW_SIZE) {
+ nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
+ }
if (nodes < minnodes) {
return FALSE;
}
@@ -250,7 +253,10 @@ bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symb
if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
minnodes = strtol (value, NULL, 10);
- nodes = g_tree_nnodes (input) / FEATURE_WINDOW_SIZE;
+ nodes = g_tree_nnodes (input);
+ if (nodes > FEATURE_WINDOW_SIZE) {
+ nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
+ }
if (nodes < minnodes) {
msg_info ("do not learn message as it has too few tokens: %d, while %d min", nodes, minnodes);
*sum = 0;
@@ -332,7 +338,10 @@ bayes_weights (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
minnodes = strtol (value, NULL, 10);
- nodes = g_tree_nnodes (input) / FEATURE_WINDOW_SIZE;
+ nodes = g_tree_nnodes (input);
+ if (nodes > FEATURE_WINDOW_SIZE) {
+ nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
+ }
if (nodes < minnodes) {
return NULL;
}
diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c
index 24ee7821c..2e8b98423 100644
--- a/src/classifiers/winnow.c
+++ b/src/classifiers/winnow.c
@@ -213,7 +213,10 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * inp
if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
minnodes = strtol (value, NULL, 10);
- nodes = g_tree_nnodes (input) / FEATURE_WINDOW_SIZE;
+ nodes = g_tree_nnodes (input);
+ if (nodes > FEATURE_WINDOW_SIZE) {
+ nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
+ }
if (nodes < minnodes) {
msg_info ("do not classify message as it has too few tokens: %d, while %d min", nodes, minnodes);
return FALSE;
@@ -305,7 +308,10 @@ winnow_weights (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * inpu
if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
minnodes = strtol (value, NULL, 10);
- nodes = g_tree_nnodes (input) / FEATURE_WINDOW_SIZE;
+ nodes = g_tree_nnodes (input);
+ if (nodes > FEATURE_WINDOW_SIZE) {
+ nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
+ }
if (nodes < minnodes) {
msg_info ("do not classify message as it has too few tokens: %d, while %d min", nodes, minnodes);
return NULL;
@@ -379,7 +385,10 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, const char *sym
if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
minnodes = strtol (value, NULL, 10);
- nodes = g_tree_nnodes (input) / FEATURE_WINDOW_SIZE;
+ nodes = g_tree_nnodes (input);
+ if (nodes > FEATURE_WINDOW_SIZE) {
+ nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
+ }
if (nodes < minnodes) {
msg_info ("do not learn message as it has too few tokens: %d, while %d min", nodes, minnodes);
if (sum != NULL) {
diff --git a/src/lua/lua_config.c b/src/lua/lua_config.c
index b77c97bfc..3b81ce64e 100644
--- a/src/lua/lua_config.c
+++ b/src/lua/lua_config.c
@@ -91,7 +91,7 @@ LUA_FUNCTION_DEF (trie, search_task);
static const struct luaL_reg trielib_m[] = {
LUA_INTERFACE_DEF (trie, create),
LUA_INTERFACE_DEF (trie, add_pattern),
- LUA_INTERFACE_DEF (trie, search_task),
+ LUA_INTERFACE_DEF (trie, search_text),
LUA_INTERFACE_DEF (trie, search_task),
{"__tostring", lua_class_tostring},
{NULL, NULL}
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index 5e3d39c50..1c1f6d9a8 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -51,7 +51,7 @@ const gchar t_delimiters[255] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
+ 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
1, 1, 1, 1, 0, 0, 0, 0, 0, 0,