From 6eded20b2c15e524ed9a83c436a3b5f0bfbd253c Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 26 Jul 2011 16:57:36 +0400 Subject: [PATCH] * Add max_tokens options to avoid classifying and learning with too many tokens from one message. Fix stupid memory leakage on client's timeout. --- src/buffer.c | 14 +++++------ src/classifiers/bayes.c | 54 +++++++++++++++++++++++++++++++++++------ src/main.c | 2 ++ src/worker.c | 5 +--- 4 files changed, 56 insertions(+), 19 deletions(-) diff --git a/src/buffer.c b/src/buffer.c index 8048dc13b..4f9ef304e 100644 --- a/src/buffer.c +++ b/src/buffer.c @@ -446,14 +446,16 @@ dispatcher_cb (gint fd, short what, void *arg) debug_ip("in dispatcher callback, what: %d, fd: %d", (gint)what, fd); - switch (what) { - case EV_TIMEOUT: + if ((what & EV_TIMEOUT) != 0) { if (d->err_callback) { err = g_error_new (G_DISPATCHER_ERROR, ETIMEDOUT, "IO timeout"); d->err_callback (err, d->user_data); } - break; - case EV_WRITE: + } + else if ((what & EV_READ) != 0) { + read_buffers (fd, d, FALSE); + } + else if ((what & EV_WRITE) != 0) { /* No data to write, disable further EV_WRITE to this fd */ if (d->in_sendfile) { sendfile_callback (d); @@ -475,10 +477,6 @@ dispatcher_cb (gint fd, short what, void *arg) write_buffers (fd, d, TRUE); } } - break; - case EV_READ: - read_buffers (fd, d, FALSE); - break; } } diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c index 5d00505be..c006228b4 100644 --- a/src/classifiers/bayes.c +++ b/src/classifiers/bayes.c @@ -60,7 +60,8 @@ struct bayes_callback_data { stat_file_t *file; struct bayes_statfile_data *statfiles; guint32 statfiles_num; - guint64 learned_tokens; + guint64 learned_tokens; + gsize max_tokens; }; static gboolean @@ -92,6 +93,10 @@ bayes_learn_callback (gpointer key, gpointer value, gpointer data) cd->learned_tokens ++; } + if (cd->max_tokens != 0 && cd->learned_tokens > cd->max_tokens) { + /* Stop learning on max tokens */ + return TRUE; + } return FALSE; } @@ -151,6 +156,12 @@ 
bayes_classify_callback (gpointer key, gpointer value, gpointer data) } } + cd->learned_tokens ++; + if (cd->max_tokens != 0 && cd->learned_tokens > cd->max_tokens) { + /* Stop classifying on max tokens */ + return TRUE; + } + return FALSE; } @@ -171,7 +182,8 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, { struct bayes_callback_data data; gchar *value; - gint nodes, minnodes, i = 0, cnt, best_num = 0; + gint nodes, i = 0, cnt, best_num = 0; + gsize minnodes; guint64 rev, total_learns = 0; double best = 0; struct statfile *st; @@ -207,6 +219,15 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, data.now = time (NULL); data.ctx = ctx; + data.learned_tokens = 0; + if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) { + minnodes = parse_limit (value); + data.max_tokens = minnodes; + } + else { + data.max_tokens = 0; + } + while (cur) { /* Select statfile to classify */ st = cur->data; @@ -264,8 +285,9 @@ bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symb gboolean in_class, double *sum, double multiplier, GError **err) { struct bayes_callback_data data; - char *value; - int nodes, minnodes; + gchar *value; + gint nodes; + gsize minnodes; struct statfile *st, *sel_st = NULL; stat_file_t *to_learn; GList *cur; @@ -286,7 +308,7 @@ bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symb bayes_error_quark(), /* error domain */ 1, /* error code */ "message contains too few tokens: %d, while min is %d", - nodes, minnodes); + nodes, (int)minnodes); return FALSE; } } @@ -296,6 +318,14 @@ bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symb data.now = time (NULL); data.ctx = ctx; data.learned_tokens = 0; + data.learned_tokens = 0; + if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) { + minnodes = parse_limit (value); + data.max_tokens = minnodes; + 
} + else { + data.max_tokens = 0; + } cur = ctx->cfg->statfiles; while (cur) { /* Select statfile to learn */ @@ -356,7 +386,8 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool, { struct bayes_callback_data data; gchar *value; - gint nodes, minnodes; + gint nodes; + gsize minnodes; struct statfile *st; stat_file_t *file; GList *cur; @@ -375,7 +406,7 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool, bayes_error_quark(), /* error domain */ 1, /* error code */ "message contains too few tokens: %d, while min is %d", - nodes, minnodes); + nodes, (int)minnodes); return FALSE; } } @@ -392,6 +423,15 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool, data.now = time (NULL); data.ctx = ctx; + data.learned_tokens = 0; + if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) { + minnodes = parse_limit (value); + data.max_tokens = minnodes; + } + else { + data.max_tokens = 0; + } + while (cur) { /* Select statfiles to learn */ st = cur->data; diff --git a/src/main.c b/src/main.c index e37e830c0..f043023a2 100644 --- a/src/main.c +++ b/src/main.c @@ -873,6 +873,8 @@ main (gint argc, gchar **argv, gchar **env) /* Init classifiers options */ register_classifier_opt ("bayes", "min_tokens"); register_classifier_opt ("winnow", "min_tokens"); + register_classifier_opt ("bayes", "max_tokens"); + register_classifier_opt ("winnow", "max_tokens"); register_classifier_opt ("winnow", "learn_threshold"); /* Pre-init of cache */ diff --git a/src/worker.c b/src/worker.c index e4dfdce3f..b919ad407 100644 --- a/src/worker.c +++ b/src/worker.c @@ -345,7 +345,6 @@ write_socket (void *arg) switch (task->state) { case WRITE_REPLY: if (!write_reply (task)) { - destroy_session (task->s); return FALSE; } if (ctx->is_custom) { @@ -401,9 +400,7 @@ err_socket (GError * err, void *arg) fin_custom_filters (task); } g_error_free (err); - if (task->state != WRITE_REPLY) { - destroy_session (task->s); - } + 
destroy_session (task->s); } /* -- 2.39.5