about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-02-22 21:32:22 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-02-22 21:32:22 +0000
commit ef9d816e983d6559e6c711a2a5e2e76a2ddbd226 (patch)
tree dd88474496b8df6d950d51ac516693ed9f39b5fe /src
parent 32d054c31888e6eb9f4e449e4c237142d4042b7f (diff)
download rspamd-ef9d816e983d6559e6c711a2a5e2e76a2ddbd226.tar.gz
download rspamd-ef9d816e983d6559e6c711a2a5e2e76a2ddbd226.zip
Allow configurable tokenizers.
Diffstat (limited to 'src')
-rw-r--r-- src/libserver/cfg_file.h 7
-rw-r--r-- src/libserver/cfg_rcl.c 20
-rw-r--r-- src/libstat/stat_process.c 28
-rw-r--r-- src/libstat/tokenizers/osb.c 2
-rw-r--r-- src/libstat/tokenizers/tokenizers.h 4
5 files changed, 44 insertions, 17 deletions
diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h
index a0eb149df..3b6191306 100644
--- a/src/libserver/cfg_file.h
+++ b/src/libserver/cfg_file.h
@@ -141,6 +141,11 @@ struct rspamd_statfile_config {
gpointer data; /**< opaque data */
};
+struct rspamd_tokenizer_config {
+ const ucl_object_t *opts; /**< other options */
+ const gchar *name; /**< name of tokenizer */
+};
+
/**
* Classifier config definition
*/
@@ -149,7 +154,7 @@ struct rspamd_classifier_config {
GHashTable *labels; /**< statfiles with labels */
gchar *metric; /**< metric of this classifier */
gchar *classifier; /**< classifier interface */
- gchar *tokenizer; /**< tokenizer used for classifier */
+ struct rspamd_tokenizer_config *tokenizer; /**< tokenizer used for classifier */
ucl_object_t *opts; /**< other options */
GList *pre_callbacks; /**< list of callbacks that are called before classification */
GList *post_callbacks; /**< list of callbacks that are called after classification */
diff --git a/src/libserver/cfg_rcl.c b/src/libserver/cfg_rcl.c
index eece86fb7..0ba9423a1 100644
--- a/src/libserver/cfg_rcl.c
+++ b/src/libserver/cfg_rcl.c
@@ -930,6 +930,7 @@ rspamd_rcl_classifier_handler (struct rspamd_config *cfg,
struct rspamd_classifier_config *ccf;
gboolean res = TRUE;
struct rspamd_rcl_section *stat_section;
+ struct rspamd_tokenizer_config *tkcf = NULL;
ccf = rspamd_config_new_classifier (cfg, NULL);
@@ -960,6 +961,19 @@ rspamd_rcl_classifier_handler (struct rspamd_config *cfg,
}
}
}
+ else if (g_ascii_strcasecmp (key, "tokenizer") == 0) {
+ tkcf = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (*tkcf));
+ if (ucl_object_type (val) == UCL_STRING) {
+ tkcf->name = ucl_object_tostring (val);
+ }
+ else if (ucl_object_type (val) == UCL_OBJECT) {
+ cur = ucl_object_find_key (val, "name");
+ if (cur != NULL) {
+ tkcf->name = ucl_object_tostring (cur);
+ tkcf->opts = val;
+ }
+ }
+ }
}
}
}
@@ -968,6 +982,7 @@ rspamd_rcl_classifier_handler (struct rspamd_config *cfg,
}
ccf->opts = (ucl_object_t *)obj;
+ ccf->tokenizer = tkcf;
cfg->classifiers = g_list_prepend (cfg->classifiers, ccf);
@@ -1357,11 +1372,6 @@ rspamd_rcl_config_init (void)
G_STRUCT_OFFSET (struct rspamd_classifier_config, classifier),
0);
rspamd_rcl_add_default_handler (sub,
- "tokenizer",
- rspamd_rcl_parse_struct_string,
- G_STRUCT_OFFSET (struct rspamd_classifier_config, tokenizer),
- 0);
- rspamd_rcl_add_default_handler (sub,
"min_tokens",
rspamd_rcl_parse_struct_integer,
G_STRUCT_OFFSET (struct rspamd_classifier_config, min_tokens),
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index 511a9f800..eafbe2092 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -43,10 +43,19 @@ struct preprocess_cb_data {
};
static struct rspamd_tokenizer_runtime *
-rspamd_stat_get_tokenizer_runtime (const gchar *name, rspamd_mempool_t *pool,
+rspamd_stat_get_tokenizer_runtime (struct rspamd_tokenizer_config *cf,
+ rspamd_mempool_t *pool,
struct rspamd_tokenizer_runtime **ls)
{
struct rspamd_tokenizer_runtime *tok = NULL, *cur;
+ const gchar *name;
+
+ if (cf == NULL || cf->name == NULL) {
+ name = RSPAMD_DEFAULT_TOKENIZER;
+ }
+ else {
+ name = cf->name;
+ }
LL_FOREACH (*ls, cur) {
if (strcmp (cur->name, name) == 0) {
@@ -270,7 +279,8 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
* Tokenize task using the tokenizer specified
*/
static void
-rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
+rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
+ struct rspamd_stat_ctx *st_ctx,
struct rspamd_task *task, struct rspamd_tokenizer_runtime *tok)
{
struct mime_text_part *part;
@@ -287,7 +297,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
/*
* XXX: Use normalized words if needed here
*/
- tok->tokenizer->tokenize_func (tok->tokenizer, task->task_pool,
+ tok->tokenizer->tokenize_func (cf, task->task_pool,
part->words, tok->tokens, part->is_utf);
}
@@ -304,7 +314,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
if (sub != NULL) {
words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL);
if (words != NULL) {
- tok->tokenizer->tokenize_func (tok->tokenizer,
+ tok->tokenizer->tokenize_func (cf,
task->task_pool,
words,
tok->tokens,
@@ -349,11 +359,12 @@ rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err)
if (tok == NULL) {
g_set_error (err, rspamd_stat_quark (), 500, "type %s is not defined"
- "for tokenizers", clcf->tokenizer);
+ "for tokenizers", clcf->tokenizer ?
+ clcf->tokenizer->name : "unknown");
return RSPAMD_STAT_PROCESS_ERROR;
}
- rspamd_stat_process_tokenize (st_ctx, task, tok);
+ rspamd_stat_process_tokenize (clcf->tokenizer, st_ctx, task, tok);
cur = g_list_next (cur);
}
@@ -487,11 +498,12 @@ rspamd_stat_learn (struct rspamd_task *task, gboolean spam, lua_State *L,
if (tok == NULL) {
g_set_error (err, rspamd_stat_quark (), 500, "type %s is not defined"
- "for tokenizers", clcf->tokenizer);
+ "for tokenizers", clcf->tokenizer ?
+ clcf->tokenizer->name : "unknown");
return RSPAMD_STAT_PROCESS_ERROR;
}
- rspamd_stat_process_tokenize (st_ctx, task, tok);
+ rspamd_stat_process_tokenize (clcf->tokenizer, st_ctx, task, tok);
cur = g_list_next (cur);
}
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index 0a8d01ce1..b51e909a9 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -38,7 +38,7 @@
extern const int primes[];
int
-osb_tokenize_text (struct rspamd_stat_tokenizer *tokenizer,
+osb_tokenize_text (struct rspamd_tokenizer_config *cf,
rspamd_mempool_t * pool,
GArray * input,
GTree * tree,
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index d4c116e13..0bc594842 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -12,7 +12,7 @@
/* Common tokenizer structure */
struct rspamd_stat_tokenizer {
gchar *name;
- gint (*tokenize_func)(struct rspamd_stat_tokenizer *rspamd_stat_tokenizer,
+ gint (*tokenize_func)(struct rspamd_tokenizer_config *cf,
rspamd_mempool_t *pool,
GArray *words,
GTree *result,
@@ -31,7 +31,7 @@ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
gsize min_len, GList **exceptions);
/* OSB tokenize function */
-int osb_tokenize_text (struct rspamd_stat_tokenizer *tokenizer,
+int osb_tokenize_text (struct rspamd_tokenizer_config *cf,
rspamd_mempool_t *pool,
GArray *input,
GTree *tokens,