From a28536ff4d1bc30392e185f48e61d3cf858ef7b2 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 25 Jan 2011 21:31:52 +0300 Subject: [PATCH] Fixes in classifying for small messages. --- src/classifiers/bayes.c | 15 ++++++++++++--- src/classifiers/winnow.c | 15 ++++++++++++--- src/lua/lua_config.c | 2 +- src/tokenizers/tokenizers.c | 2 +- 4 files changed, 26 insertions(+), 8 deletions(-) diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c index 64783e0b4..9ef2544b0 100644 --- a/src/classifiers/bayes.c +++ b/src/classifiers/bayes.c @@ -178,7 +178,10 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { minnodes = strtol (value, NULL, 10); - nodes = g_tree_nnodes (input) / FEATURE_WINDOW_SIZE; + nodes = g_tree_nnodes (input); + if (nodes > FEATURE_WINDOW_SIZE) { + nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; + } if (nodes < minnodes) { return FALSE; } @@ -250,7 +253,10 @@ bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symb if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { minnodes = strtol (value, NULL, 10); - nodes = g_tree_nnodes (input) / FEATURE_WINDOW_SIZE; + nodes = g_tree_nnodes (input); + if (nodes > FEATURE_WINDOW_SIZE) { + nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; + } if (nodes < minnodes) { msg_info ("do not learn message as it has too few tokens: %d, while %d min", nodes, minnodes); *sum = 0; @@ -332,7 +338,10 @@ bayes_weights (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { minnodes = strtol (value, NULL, 10); - nodes = g_tree_nnodes (input) / FEATURE_WINDOW_SIZE; + nodes = g_tree_nnodes (input); + if (nodes > FEATURE_WINDOW_SIZE) { + nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; + } if (nodes < minnodes) { return NULL; } diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c index 24ee7821c..2e8b98423 100644 --- a/src/classifiers/winnow.c +++ b/src/classifiers/winnow.c @@ -213,7 +213,10 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * inp if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { minnodes = strtol (value, NULL, 10); - nodes = g_tree_nnodes (input) / FEATURE_WINDOW_SIZE; + nodes = g_tree_nnodes (input); + if (nodes > FEATURE_WINDOW_SIZE) { + nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; + } if (nodes < minnodes) { msg_info ("do not classify message as it has too few tokens: %d, while %d min", nodes, minnodes); return FALSE; @@ -305,7 +308,10 @@ winnow_weights (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * inpu if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { minnodes = strtol (value, NULL, 10); - nodes = g_tree_nnodes (input) / FEATURE_WINDOW_SIZE; + nodes = g_tree_nnodes (input); + if (nodes > FEATURE_WINDOW_SIZE) { + nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; + } if (nodes < minnodes) { msg_info ("do not classify message as it has too few tokens: %d, while %d min", nodes, minnodes); return NULL; @@ -379,7 +385,10 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, const char *sym if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { minnodes = strtol (value, NULL, 10); - nodes = g_tree_nnodes (input) / FEATURE_WINDOW_SIZE; + nodes = g_tree_nnodes (input); + if (nodes > FEATURE_WINDOW_SIZE) { + nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; + } if (nodes < minnodes) { msg_info ("do not learn message as it has too few tokens: %d, while %d min", nodes, minnodes); if (sum != NULL) { diff --git a/src/lua/lua_config.c b/src/lua/lua_config.c index b77c97bfc..3b81ce64e 100644 --- a/src/lua/lua_config.c +++ b/src/lua/lua_config.c @@ -91,7 +91,7 @@ LUA_FUNCTION_DEF (trie, search_task); static const struct luaL_reg trielib_m[] = { LUA_INTERFACE_DEF (trie, create), LUA_INTERFACE_DEF (trie, add_pattern), - LUA_INTERFACE_DEF (trie, search_task), + LUA_INTERFACE_DEF (trie, search_text), LUA_INTERFACE_DEF (trie, search_task), {"__tostring", lua_class_tostring}, {NULL, NULL} diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c index 5e3d39c50..1c1f6d9a8 100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -51,7 +51,7 @@ const gchar t_delimiters[255] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, + 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, -- 2.39.5