summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-27 13:28:15 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-27 13:28:15 +0100
commit: 043c5b7bcdb055c7f45034bd0a83408773c35bfd (patch)
tree: 45067ad27ec876a580a5f880fac448829149a22d /src
parent: c111a765eb9cc6e8d362a427ab435db959415fd9 (diff)
download: rspamd-043c5b7bcdb055c7f45034bd0a83408773c35bfd.tar.gz
download: rspamd-043c5b7bcdb055c7f45034bd0a83408773c35bfd.zip
More changes to tokenization.
Diffstat (limited to 'src')
-rw-r--r-- src/libserver/cfg_file.h 2
-rw-r--r-- src/libserver/cfg_rcl.c 10
-rw-r--r-- src/libstat/stat_internal.h 5
-rw-r--r-- src/libstat/stat_process.c 117
-rw-r--r-- src/libstat/tokenizers/tokenizers.h 6
5 files changed, 65 insertions, 75 deletions
diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h
index e2c43845a..b14690632 100644
--- a/src/libserver/cfg_file.h
+++ b/src/libserver/cfg_file.h
@@ -104,7 +104,6 @@ struct rspamd_statfile_config {
gchar *label; /**< label of this statfile */
ucl_object_t *opts; /**< other options */
gboolean is_spam; /**< spam flag */
- const gchar *backend; /**< name of statfile's backend */
struct rspamd_classifier_config *clcf; /**< parent pointer of classifier configuration */
gpointer data; /**< opaque data */
};
@@ -123,6 +122,7 @@ struct rspamd_classifier_config {
gchar *metric; /**< metric of this classifier */
gchar *classifier; /**< classifier interface */
struct rspamd_tokenizer_config *tokenizer; /**< tokenizer used for classifier */
+ const gchar *backend; /**< name of statfile's backend */
ucl_object_t *opts; /**< other options */
GList *pre_callbacks; /**< list of callbacks that are called before classification */
GList *post_callbacks; /**< list of callbacks that are called after classification */
diff --git a/src/libserver/cfg_rcl.c b/src/libserver/cfg_rcl.c
index aba8b741b..13c5eb958 100644
--- a/src/libserver/cfg_rcl.c
+++ b/src/libserver/cfg_rcl.c
@@ -1494,6 +1494,11 @@ rspamd_rcl_config_init (void)
rspamd_rcl_parse_struct_integer,
G_STRUCT_OFFSET (struct rspamd_classifier_config, max_tokens),
RSPAMD_CL_FLAG_INT_32);
+ rspamd_rcl_add_default_handler (sub,
+ "backend",
+ rspamd_rcl_parse_struct_string,
+ G_STRUCT_OFFSET (struct rspamd_classifier_config, backend),
+ 0);
/*
* Statfile defaults
@@ -1519,11 +1524,6 @@ rspamd_rcl_config_init (void)
rspamd_rcl_parse_struct_boolean,
G_STRUCT_OFFSET (struct rspamd_statfile_config, is_spam),
0);
- rspamd_rcl_add_default_handler (ssub,
- "backend",
- rspamd_rcl_parse_struct_string,
- G_STRUCT_OFFSET (struct rspamd_statfile_config, backend),
- 0);
/**
* Composites handler
diff --git a/src/libstat/stat_internal.h b/src/libstat/stat_internal.h
index c7167f701..64790cddc 100644
--- a/src/libstat/stat_internal.h
+++ b/src/libstat/stat_internal.h
@@ -45,8 +45,6 @@ struct rspamd_tokenizer_runtime {
struct rspamd_statfile_runtime {
struct rspamd_statfile_config *st;
- struct rspamd_stat_backend *backend;
- struct rspamd_tokenizer_runtime *tok;
gpointer backend_runtime;
guint64 hits;
guint64 total_hits;
@@ -55,7 +53,8 @@ struct rspamd_statfile_runtime {
struct rspamd_classifier_runtime {
struct rspamd_classifier_config *clcf;
struct rspamd_stat_classifier *cl;
- GHashTable *tokenizers;
+ struct rspamd_stat_backend *backend;
+ struct rspamd_tokenizer_runtime *tok;
double ham_prob;
double spam_prob;
enum stat_process_stage stage;
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index e60cfe1d4..e6d7c90c1 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -220,8 +220,8 @@ preprocess_init_stat_token (gpointer k, gpointer v, gpointer d)
res->cl_runtime = cl_runtime;
res->st_runtime = st_runtime;
- if (st_runtime->backend->process_token (cbdata->task, t, res,
- st_runtime->backend->ctx)) {
+ if (cl_runtime->backend->process_token (cbdata->task, t, res,
+ cl_runtime->backend->ctx)) {
if (cl_runtime->clcf->max_tokens > 0 &&
cl_runtime->processed_tokens > cl_runtime->clcf->max_tokens) {
@@ -237,6 +237,7 @@ preprocess_init_stat_token (gpointer k, gpointer v, gpointer d)
i ++;
curst = g_list_next (curst);
}
+
cur = g_list_next (cur);
}
@@ -244,26 +245,6 @@ preprocess_init_stat_token (gpointer k, gpointer v, gpointer d)
return FALSE;
}
-static gboolean
-rspamd_tokenizer_equal (gconstpointer a, gconstpointer b)
-{
- struct rspamd_tokenizer_runtime *ta = a, *tb = b;
-
- if (ta->conf_len == tb->conf_len) {
- return memcmp (ta->config, tb->config, ta->conf_len) == 0;
- }
-
- return FALSE;
-}
-
-static guint
-rspamd_tokenizer_hash (gconstpointer a)
-{
- struct rspamd_tokenizer_runtime *ta = a;
-
- return XXH64 (ta->config, ta->conf_len, 0xdeadbabe);
-}
-
static GList*
rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
struct rspamd_task *task,
@@ -274,11 +255,13 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
struct rspamd_classifier_runtime *cl_runtime;
struct rspamd_statfile_runtime *st_runtime;
struct rspamd_stat_backend *bk;
- gpointer backend_runtime;
+ gpointer backend_runtime, tok_config;
GList *cur, *st_list = NULL, *curst;
GList *cl_runtimes = NULL;
+ GHashTableIter it;
guint result_size = 0, start_pos = 0, end_pos = 0;
- struct rspamd_tokenizer_runtime *tok_runtime, srch_tok;
+ gsize conf_len;
+ struct rspamd_tokenizer_runtime *tok_runtime;
struct preprocess_cb_data cbdata;
cur = g_list_first (task->cfg->classifiers);
@@ -302,11 +285,6 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
/* Now init runtime values */
cl_runtime = rspamd_mempool_alloc0 (task->task_pool, sizeof (*cl_runtime));
cl_runtime->cl = rspamd_stat_get_classifier (clcf->classifier);
- cl_runtime->tokenizers = g_hash_table_new (rspamd_tokenizer_hash,
- rspamd_tokenizer_equal);
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t)g_hash_table_destroy,
- cl_runtime->tokenizers);
if (cl_runtime->cl == NULL) {
g_set_error (err, rspamd_stat_quark(), 500,
@@ -316,6 +294,12 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
}
cl_runtime->clcf = clcf;
+ bk = rspamd_stat_get_backend (clcf->backend);
+ if (bk == NULL) {
+ msg_warn ("backend of type %s is not defined", clcf->backend);
+ cur = g_list_next (cur);
+ continue;
+ }
curst = st_list;
while (curst != NULL) {
@@ -327,14 +311,6 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
continue;
}
- bk = rspamd_stat_get_backend (stcf->backend);
-
- if (bk == NULL) {
- msg_warn ("backend of type %s is not defined", stcf->backend);
- curst = g_list_next (curst);
- continue;
- }
-
backend_runtime = bk->runtime (task, stcf, op != RSPAMD_CLASSIFY_OP,
bk->ctx);
@@ -350,29 +326,42 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
else {
/* Just skip this element */
msg_warn ("backend of type %s does not exist: %s",
- stcf->backend, stcf->symbol);
+ clcf->backend, stcf->symbol);
curst = g_list_next (curst);
continue;
}
}
- srch_tok.config = bk->load_tokenizer_config (backend_runtime,
- &srch_tok.conf_len);
+ tok_config = bk->load_tokenizer_config (backend_runtime,
+ &conf_len);
+
+ if (cl_runtime->tok == NULL) {
+ cl_runtime->tok = rspamd_stat_get_tokenizer_runtime (clcf->tokenizer,
+ st_ctx, task, cl_runtime, tok_config, conf_len);
+
+ if (cl_runtime->tok == NULL) {
+ g_set_error (err, rspamd_stat_quark(), 500,
+ "cannot initialize tokenizer for statfile %s", stcf->symbol);
+ g_list_free (cl_runtimes);
+
+ return NULL;
+ }
+ }
+ else {
+ if (!cl_runtime->tok->tokenizer->compatible_config (
+ cl_runtime->tok, tok_config, conf_len)) {
+ g_set_error (err, rspamd_stat_quark(), 500,
+ "incompatible tokenizer for statfile %s", stcf->symbol);
+ g_list_free (cl_runtimes);
+
+ return NULL;
+ }
+ }
st_runtime = rspamd_mempool_alloc0 (task->task_pool,
sizeof (*st_runtime));
st_runtime->st = stcf;
st_runtime->backend_runtime = backend_runtime;
- st_runtime->backend = bk;
- st_runtime->tok = g_hash_table_lookup (cl_runtime->tokenizers, &srch_tok);
-
- if (st_runtime->tok == NULL) {
- st_runtime->tok = rspamd_stat_get_tokenizer_runtime (clcf->tokenizer,
- st_ctx, task, cl_runtime, srch_tok.config,
- srch_tok.conf_len);
-
- g_assert (st_runtime->tok != NULL);
- }
if (stcf->is_spam) {
cl_runtime->total_spam += bk->total_learns (task, backend_runtime,
@@ -420,7 +409,7 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
cbdata.classifier_runtimes = cl_runtimes;
cbdata.task = task;
cbdata.tok = cl_runtime->tok;
- g_tree_foreach (cl_runtime->tok->tokens, preprocess_init_stat_token,
+ g_tree_foreach (cbdata.tok->tokens, preprocess_init_stat_token,
&cbdata);
}
@@ -522,9 +511,9 @@ rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err)
while (curst) {
st_run = curst->data;
- st_run->backend->finalize_process (task,
+ cl_run->backend->finalize_process (task,
st_run->backend_runtime,
- st_run->backend->ctx);
+ cl_run->backend->ctx);
curst = g_list_next (curst);
}
@@ -567,8 +556,8 @@ rspamd_stat_learn_token (gpointer k, gpointer v, gpointer d)
res = &g_array_index (t->results, struct rspamd_token_result, i);
st_runtime = (struct rspamd_statfile_runtime *)curst->data;
- if (st_runtime->backend->learn_token (cbdata->task, t, res,
- st_runtime->backend->ctx)) {
+ if (cl_runtime->backend->learn_token (cbdata->task, t, res,
+ cl_runtime->backend->ctx)) {
cl_runtime->processed_tokens ++;
if (cl_runtime->clcf->max_tokens > 0 &&
@@ -698,23 +687,23 @@ rspamd_stat_learn (struct rspamd_task *task, gboolean spam, lua_State *L,
st_run = (struct rspamd_statfile_runtime *)curst->data;
if (unlearn && spam != st_run->st->is_spam) {
- nrev = st_run->backend->dec_learns (task,
+ nrev = cl_run->backend->dec_learns (task,
st_run->backend_runtime,
- st_run->backend->ctx);
+ cl_run->backend->ctx);
msg_debug ("unlearned %s, new revision: %ul",
st_run->st->symbol, nrev);
}
else {
- nrev = st_run->backend->inc_learns (task,
+ nrev = cl_run->backend->inc_learns (task,
st_run->backend_runtime,
- st_run->backend->ctx);
+ cl_run->backend->ctx);
msg_debug ("learned %s, new revision: %ul",
st_run->st->symbol, nrev);
}
- st_run->backend->finalize_learn (task,
- st_run->backend_runtime,
- st_run->backend->ctx);
+ cl_run->backend->finalize_learn (task,
+ st_run->backend_runtime,
+ cl_run->backend->ctx);
curst = g_list_next (curst);
}
@@ -762,10 +751,10 @@ rspamd_stat_statistics (struct rspamd_config *cfg, guint64 *total_learns)
while (curst != NULL) {
stcf = (struct rspamd_statfile_config *)curst->data;
- bk = rspamd_stat_get_backend (stcf->backend);
+ bk = rspamd_stat_get_backend (clcf->backend);
if (bk == NULL) {
- msg_warn ("backend of type %s is not defined", stcf->backend);
+ msg_warn ("backend of type %s is not defined", clcf->backend);
curst = g_list_next (curst);
continue;
}
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 1f4b0a54f..a2ff388ef 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -9,11 +9,13 @@
#define RSPAMD_DEFAULT_TOKENIZER "osb"
+struct rspamd_tokenizer_runtime;
+
/* Common tokenizer structure */
struct rspamd_stat_tokenizer {
gchar *name;
gpointer (*get_config) (struct rspamd_tokenizer_config *cf, gsize *len);
- gboolean (*compatible_config) (struct rspamd_tokenizer_config *cf,
+ gboolean (*compatible_config) (struct rspamd_tokenizer_runtime *rt,
gpointer ptr, gsize len);
gboolean (*load_config) (struct rspamd_tokenizer_runtime *rt,
gpointer ptr, gsize len);
@@ -44,7 +46,7 @@ gpointer rspamd_tokenizer_osb_get_config (struct rspamd_tokenizer_config *cf,
gsize *len);
gboolean
-rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_config *cf,
+rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt,
gpointer ptr, gsize len);
gboolean