diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-02-22 21:32:22 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-02-22 21:32:22 +0000 |
commit | ef9d816e983d6559e6c711a2a5e2e76a2ddbd226 (patch) | |
tree | dd88474496b8df6d950d51ac516693ed9f39b5fe /src | |
parent | 32d054c31888e6eb9f4e449e4c237142d4042b7f (diff) | |
download | rspamd-ef9d816e983d6559e6c711a2a5e2e76a2ddbd226.tar.gz rspamd-ef9d816e983d6559e6c711a2a5e2e76a2ddbd226.zip |
Allow configurable tokenizers.
Diffstat (limited to 'src')
-rw-r--r-- | src/libserver/cfg_file.h | 7 | ||||
-rw-r--r-- | src/libserver/cfg_rcl.c | 20 | ||||
-rw-r--r-- | src/libstat/stat_process.c | 28 | ||||
-rw-r--r-- | src/libstat/tokenizers/osb.c | 2 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 4 |
5 files changed, 44 insertions, 17 deletions
diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h index a0eb149df..3b6191306 100644 --- a/src/libserver/cfg_file.h +++ b/src/libserver/cfg_file.h @@ -141,6 +141,11 @@ struct rspamd_statfile_config { gpointer data; /**< opaque data */ }; +struct rspamd_tokenizer_config { + const ucl_object_t *opts; /**< other options */ + const gchar *name; /**< name of tokenizer */ +}; + /** * Classifier config definition */ @@ -149,7 +154,7 @@ struct rspamd_classifier_config { GHashTable *labels; /**< statfiles with labels */ gchar *metric; /**< metric of this classifier */ gchar *classifier; /**< classifier interface */ - gchar *tokenizer; /**< tokenizer used for classifier */ + struct rspamd_tokenizer_config *tokenizer; /**< tokenizer used for classifier */ ucl_object_t *opts; /**< other options */ GList *pre_callbacks; /**< list of callbacks that are called before classification */ GList *post_callbacks; /**< list of callbacks that are called after classification */ diff --git a/src/libserver/cfg_rcl.c b/src/libserver/cfg_rcl.c index eece86fb7..0ba9423a1 100644 --- a/src/libserver/cfg_rcl.c +++ b/src/libserver/cfg_rcl.c @@ -930,6 +930,7 @@ rspamd_rcl_classifier_handler (struct rspamd_config *cfg, struct rspamd_classifier_config *ccf; gboolean res = TRUE; struct rspamd_rcl_section *stat_section; + struct rspamd_tokenizer_config *tkcf = NULL; ccf = rspamd_config_new_classifier (cfg, NULL); @@ -960,6 +961,19 @@ rspamd_rcl_classifier_handler (struct rspamd_config *cfg, } } } + else if (g_ascii_strcasecmp (key, "tokenizer") == 0) { + tkcf = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (*tkcf)); + if (ucl_object_type (val) == UCL_STRING) { + tkcf->name = ucl_object_tostring (val); + } + else if (ucl_object_type (val) == UCL_OBJECT) { + cur = ucl_object_find_key (val, "name"); + if (cur != NULL) { + tkcf->name = ucl_object_tostring (cur); + tkcf->opts = val; + } + } + } } } } @@ -968,6 +982,7 @@ rspamd_rcl_classifier_handler (struct rspamd_config *cfg, } ccf->opts = (ucl_object_t *)obj; + ccf->tokenizer = tkcf; cfg->classifiers = g_list_prepend (cfg->classifiers, ccf); @@ -1357,11 +1372,6 @@ rspamd_rcl_config_init (void) G_STRUCT_OFFSET (struct rspamd_classifier_config, classifier), 0); rspamd_rcl_add_default_handler (sub, - "tokenizer", - rspamd_rcl_parse_struct_string, - G_STRUCT_OFFSET (struct rspamd_classifier_config, tokenizer), - 0); - rspamd_rcl_add_default_handler (sub, "min_tokens", rspamd_rcl_parse_struct_integer, G_STRUCT_OFFSET (struct rspamd_classifier_config, min_tokens), diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index 511a9f800..eafbe2092 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -43,10 +43,19 @@ struct preprocess_cb_data { }; static struct rspamd_tokenizer_runtime * -rspamd_stat_get_tokenizer_runtime (const gchar *name, rspamd_mempool_t *pool, +rspamd_stat_get_tokenizer_runtime (struct rspamd_tokenizer_config *cf, + rspamd_mempool_t *pool, struct rspamd_tokenizer_runtime **ls) { struct rspamd_tokenizer_runtime *tok = NULL, *cur; + const gchar *name; + + if (cf == NULL || cf->name == NULL) { + name = RSPAMD_DEFAULT_TOKENIZER; + } + else { + name = cf->name; + } LL_FOREACH (*ls, cur) { if (strcmp (cur->name, name) == 0) { @@ -270,7 +279,8 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx, * Tokenize task using the tokenizer specified */ static void -rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, +rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf, + struct rspamd_stat_ctx *st_ctx, struct rspamd_task *task, struct rspamd_tokenizer_runtime *tok) { struct mime_text_part *part; @@ -287,7 +297,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, /* * XXX: Use normalized words if needed here */ - tok->tokenizer->tokenize_func (tok->tokenizer, task->task_pool, + tok->tokenizer->tokenize_func (cf, task->task_pool, part->words, tok->tokens, part->is_utf); } @@ -304,7 +314,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, if (sub != NULL) { words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL); if (words != NULL) { - tok->tokenizer->tokenize_func (tok->tokenizer, + tok->tokenizer->tokenize_func (cf, task->task_pool, words, tok->tokens, @@ -349,11 +359,12 @@ rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err) if (tok == NULL) { g_set_error (err, rspamd_stat_quark (), 500, "type %s is not defined" - "for tokenizers", clcf->tokenizer); + "for tokenizers", clcf->tokenizer ? + clcf->tokenizer->name : "unknown"); return RSPAMD_STAT_PROCESS_ERROR; } - rspamd_stat_process_tokenize (st_ctx, task, tok); + rspamd_stat_process_tokenize (clcf->tokenizer, st_ctx, task, tok); cur = g_list_next (cur); } @@ -487,11 +498,12 @@ rspamd_stat_learn (struct rspamd_task *task, gboolean spam, lua_State *L, if (tok == NULL) { g_set_error (err, rspamd_stat_quark (), 500, "type %s is not defined" - "for tokenizers", clcf->tokenizer); + "for tokenizers", clcf->tokenizer ? + clcf->tokenizer->name : "unknown"); return RSPAMD_STAT_PROCESS_ERROR; } - rspamd_stat_process_tokenize (st_ctx, task, tok); + rspamd_stat_process_tokenize (clcf->tokenizer, st_ctx, task, tok); cur = g_list_next (cur); } diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c index 0a8d01ce1..b51e909a9 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -38,7 +38,7 @@ extern const int primes[]; int -osb_tokenize_text (struct rspamd_stat_tokenizer *tokenizer, +osb_tokenize_text (struct rspamd_tokenizer_config *cf, rspamd_mempool_t * pool, GArray * input, GTree * tree, diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index d4c116e13..0bc594842 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -12,7 +12,7 @@ /* Common tokenizer structure */ struct rspamd_stat_tokenizer { gchar *name; - gint (*tokenize_func)(struct rspamd_stat_tokenizer *rspamd_stat_tokenizer, + gint (*tokenize_func)(struct rspamd_tokenizer_config *cf, rspamd_mempool_t *pool, GArray *words, GTree *result, @@ -31,7 +31,7 @@ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, gsize min_len, GList **exceptions); /* OSB tokenize function */ -int osb_tokenize_text (struct rspamd_stat_tokenizer *tokenizer, +int osb_tokenize_text (struct rspamd_tokenizer_config *cf, rspamd_mempool_t *pool, GArray *input, GTree *tokens, |