From d4fbe7db611f537e0fb117dff6b01aea43dd8fe0 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 27 Jul 2015 14:48:46 +0100 Subject: [PATCH] Fix tokenizers and mmapped file. --- src/libstat/backends/backends.h | 2 +- src/libstat/backends/mmaped_file.c | 63 +++++++++++++++------------ src/libstat/stat_internal.h | 1 + src/libstat/stat_process.c | 3 +- src/libstat/tokenizers/osb.c | 67 +++++++++++++++++++++-------- src/libstat/tokenizers/tokenizers.h | 12 ++++-- 6 files changed, 96 insertions(+), 52 deletions(-) diff --git a/src/libstat/backends/backends.h b/src/libstat/backends/backends.h index c25ab648e..4ac59655c 100644 --- a/src/libstat/backends/backends.h +++ b/src/libstat/backends/backends.h @@ -99,7 +99,7 @@ struct rspamd_stat_backend { gpointer ctx); \ ucl_object_t * rspamd_##name##_get_stat (gpointer runtime, \ gpointer ctx); \ - void rspamd_##name##_load_tokenizer_config (gpointer runtime, \ + gpointer rspamd_##name##_load_tokenizer_config (gpointer runtime, \ gsize *len); \ void rspamd_##name##_close (gpointer ctx) diff --git a/src/libstat/backends/mmaped_file.c b/src/libstat/backends/mmaped_file.c index fb3c4fd43..bbd106187 100644 --- a/src/libstat/backends/mmaped_file.c +++ b/src/libstat/backends/mmaped_file.c @@ -120,7 +120,8 @@ rspamd_mmaped_file_t * rspamd_mmaped_file_is_open ( rspamd_mmaped_file_t * rspamd_mmaped_file_open (rspamd_mmaped_file_ctx * pool, const gchar *filename, size_t size, struct rspamd_statfile_config *stcf); gint rspamd_mmaped_file_create (rspamd_mmaped_file_ctx * pool, - const gchar *filename, size_t size, struct rspamd_statfile_config *stcf); + const gchar *filename, size_t size, struct rspamd_statfile_config *stcf, + rspamd_mempool_t *mempool); double rspamd_mmaped_file_get_block (rspamd_mmaped_file_ctx * pool, @@ -452,7 +453,7 @@ rspamd_mmaped_file_reindex (rspamd_mmaped_file_ctx * pool, } /* Now create new file with required size */ - if (rspamd_mmaped_file_create (pool, filename, size, stcf) != 0) { + if (rspamd_mmaped_file_create (pool, filename, size, stcf, pool->pool) != 0) { msg_err ("cannot create new file"); g_free (backup); return NULL; @@ -543,8 +544,6 @@ rspamd_mmaped_file_open (rspamd_mmaped_file_ctx * pool, { struct stat st; rspamd_mmaped_file_t *new_file; - struct rspamd_stat_tokenizer *tokenizer; - struct stat_file_header *header; if ((new_file = rspamd_mmaped_file_is_open (pool, stcf)) != NULL) { return new_file; @@ -615,22 +614,7 @@ rspamd_mmaped_file_open (rspamd_mmaped_file_ctx * pool, rspamd_mmaped_file_preload (new_file); - /* Check tokenizer compatibility */ - header = new_file->map; g_assert (stcf->clcf != NULL); - g_assert (stcf->clcf->tokenizer != NULL); - tokenizer = rspamd_stat_get_tokenizer (stcf->clcf->tokenizer->name); - g_assert (tokenizer != NULL); - - if (!tokenizer->compatible_config (stcf->clcf->tokenizer, header->unused, - header->tokenizer_conf_len)) { - msg_err ("mmapped statfile %s is not compatible with the tokenizer " - "defined", new_file->filename); - munmap (new_file->map, st.st_size); - g_slice_free1 (sizeof (*new_file), new_file); - - return NULL; - } g_hash_table_insert (pool->files, stcf, new_file); @@ -664,7 +648,7 @@ rspamd_mmaped_file_close_file (rspamd_mmaped_file_ctx * pool, gint rspamd_mmaped_file_create (rspamd_mmaped_file_ctx * pool, const gchar *filename, - size_t size, struct rspamd_statfile_config *stcf) + size_t size, struct rspamd_statfile_config *stcf, rspamd_mempool_t *mempool) { struct stat_file_header header = { .magic = {'r', 's', 'd'}, @@ -722,7 +706,7 @@ rspamd_mmaped_file_create (rspamd_mmaped_file_ctx * pool, const gchar *filename, g_assert (stcf->clcf->tokenizer != NULL); tokenizer = rspamd_stat_get_tokenizer (stcf->clcf->tokenizer->name); g_assert (tokenizer != NULL); - tok_conf = tokenizer->get_config (stcf->clcf->tokenizer, &tok_conf_len); + tok_conf = tokenizer->get_config (mempool, stcf->clcf->tokenizer, &tok_conf_len); header.tokenizer_conf_len = tok_conf_len; g_assert (tok_conf_len < sizeof (header.unused) - sizeof (guint64)); memcpy (header.unused, tok_conf, tok_conf_len); @@ -819,20 +803,25 @@ rspamd_mmaped_file_init (struct rspamd_stat_ctx *ctx, struct rspamd_config *cfg) clf = cur->data; curst = clf->statfiles; - while (curst) { - stf = curst->data; + if (clf->backend == NULL) { /* * By default, all statfiles are treated as mmaped files */ - if (stf->backend == NULL || - strcmp (stf->backend, MMAPED_BACKEND_TYPE) == 0) { + clf->backend = MMAPED_BACKEND_TYPE; + } + + if (strcmp (clf->backend, MMAPED_BACKEND_TYPE) == 0) { + while (curst) { + stf = curst->data; /* * Check configuration sanity */ filenameo = ucl_object_find_key (stf->opts, "filename"); + if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) { filenameo = ucl_object_find_key (stf->opts, "path"); + if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) { msg_err ("statfile %s has no filename defined", stf->symbol); curst = curst->next; @@ -843,6 +832,7 @@ rspamd_mmaped_file_init (struct rspamd_stat_ctx *ctx, struct rspamd_config *cfg) filename = ucl_object_tostring (filenameo); sizeo = ucl_object_find_key (stf->opts, "size"); + if (sizeo == NULL || ucl_object_type (sizeo) != UCL_INT) { msg_err ("statfile %s has no size defined", stf->symbol); curst = curst->next; @@ -854,9 +844,9 @@ rspamd_mmaped_file_init (struct rspamd_stat_ctx *ctx, struct rspamd_config *cfg) rspamd_mmaped_file_open (new, filename, size, stf); ctx->statfiles ++; - } - curst = curst->next; + curst = curst->next; + } } cur = g_list_next (cur); @@ -927,7 +917,7 @@ rspamd_mmaped_file_runtime (struct rspamd_task *task, size = ucl_object_toint (sizeo); if (learn) { - rspamd_mmaped_file_create (ctx, filename, size, stcf); + rspamd_mmaped_file_create (ctx, filename, size, stcf, task->task_pool); } mf = rspamd_mmaped_file_open (ctx, filename, size, stcf); @@ -1095,3 +1085,20 @@ rspamd_mmaped_file_finalize_process (struct rspamd_task *task, gpointer runtime, gpointer ctx) { } + +gpointer +rspamd_mmaped_file_load_tokenizer_config (gpointer runtime, + gsize *len) +{ + rspamd_mmaped_file_t *mf = runtime; + struct stat_file_header *header; + + g_assert (mf != NULL); + header = mf->map; + + if (len) { + *len = header->tokenizer_conf_len; + } + + return header->unused; +} diff --git a/src/libstat/stat_internal.h b/src/libstat/stat_internal.h index 64790cddc..e70fc298a 100644 --- a/src/libstat/stat_internal.h +++ b/src/libstat/stat_internal.h @@ -39,6 +39,7 @@ struct rspamd_tokenizer_runtime { GTree *tokens; const gchar *name; struct rspamd_stat_tokenizer *tokenizer; + struct rspamd_tokenizer_config *tkcf; gpointer config; gsize conf_len; }; diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index d6350e0e0..3ec579049 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -162,12 +162,13 @@ rspamd_stat_get_tokenizer_runtime (struct rspamd_tokenizer_config *cf, return NULL; } - if (!tok->tokenizer->load_config (tok, conf, conf_len)) { + if (!tok->tokenizer->load_config (task->task_pool, tok, conf, conf_len)) { return NULL; } tok->config = conf; tok->conf_len = conf_len; + tok->tkcf = cf; tok->tokens = g_tree_new (token_node_compare_func); rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t)g_tree_destroy, tok->tokens); diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c index 5916adb85..40dec0d82 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -154,20 +154,29 @@ rspamd_tokenizer_osb_config_from_ucl (rspamd_mempool_t * pool, } gpointer -rspamd_tokenizer_osb_get_config (struct rspamd_tokenizer_config *cf, +rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool, + struct rspamd_tokenizer_config *cf, gsize *len) { struct rspamd_osb_tokenizer_config *osb_cf, *def; if (cf != NULL && cf->opts != NULL) { - osb_cf = rspamd_tokenizer_osb_config_from_ucl (NULL, cf->opts); + osb_cf = rspamd_tokenizer_osb_config_from_ucl (pool, cf->opts); } else { def = rspamd_tokenizer_osb_default_config (); - osb_cf = g_slice_alloc (sizeof (*osb_cf)); + osb_cf = rspamd_mempool_alloc (pool, sizeof (*osb_cf)); memcpy (osb_cf, def, sizeof (*osb_cf)); + /* Do not write sipkey to statfile */ + } + + if (osb_cf->ht == RSPAMD_OSB_HASH_SIPHASH) { + msg_info ("siphash key is not stored into statfiles, so you'd need to " + "keep it inside the configuration"); } + memset (osb_cf->sk, 0, sizeof (osb_cf->sk)); + if (len != NULL) { *len = sizeof (*osb_cf); } @@ -176,13 +185,14 @@ rspamd_tokenizer_osb_get_config (struct rspamd_tokenizer_config *cf, } gboolean -rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_config *cf, +rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt, gpointer ptr, gsize len) { struct rspamd_osb_tokenizer_config *osb_cf, *test_cf; gboolean ret = FALSE; - test_cf = rspamd_tokenizer_osb_get_config (cf, NULL); + test_cf = rt->config; + g_assert (test_cf != NULL); if (len == sizeof (*osb_cf)) { osb_cf = ptr; @@ -193,7 +203,8 @@ rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_config *cf, else { if (osb_cf->version == DEFAULT_OSB_VERSION) { /* We can compare them directly now */ - ret = memcmp (osb_cf, test_cf, sizeof (*osb_cf)) == 0; + ret = (memcmp (osb_cf, test_cf, sizeof (*osb_cf) + - sizeof (osb_cf->sk))) == 0; } } } @@ -208,10 +219,9 @@ rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_config *cf, } gint -rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf, +rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt, rspamd_mempool_t * pool, GArray * input, - GTree * tree, gboolean is_utf, const gchar *prefix) { @@ -221,6 +231,7 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf, guint64 *hashpipe, cur, seed; guint32 h1, h2; guint processed = 0, i, w, window_size; + GTree *tree = rt->tokens; g_assert (tree != NULL); @@ -228,13 +239,7 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf, return FALSE; } - if (cf != NULL && cf->opts != NULL) { - osb_cf = rspamd_tokenizer_osb_config_from_ucl (pool, cf->opts); - } - else { - osb_cf = rspamd_tokenizer_osb_default_config (); - } - + osb_cf = rt->config; window_size = osb_cf->window_size; if (prefix) { @@ -334,6 +339,32 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf, return TRUE; } -/* - * vi:ts=4 - */ + +gboolean +rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool, + struct rspamd_tokenizer_runtime *rt, + gpointer ptr, gsize len) +{ + struct rspamd_osb_tokenizer_config *osb_cf; + + if (ptr == NULL) { + osb_cf = rspamd_tokenizer_osb_config_from_ucl (pool, rt->tkcf->opts); + } + else { + g_assert (len == sizeof (*osb_cf)); + osb_cf = ptr; + } + + rt->config = osb_cf; + rt->conf_len = sizeof (*osb_cf); + + return TRUE; +} + +gboolean +rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt) +{ + struct rspamd_osb_tokenizer_config *osb_cf = rt->config; + + return (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT); +} diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index ed7ba4bcc..050f6d7b1 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -14,10 +14,12 @@ struct rspamd_tokenizer_runtime; /* Common tokenizer structure */ struct rspamd_stat_tokenizer { gchar *name; - gpointer (*get_config) (struct rspamd_tokenizer_config *cf, gsize *len); + gpointer (*get_config) (rspamd_mempool_t *pool, + struct rspamd_tokenizer_config *cf, gsize *len); gboolean (*compatible_config) (struct rspamd_tokenizer_runtime *rt, gpointer ptr, gsize len); - gboolean (*load_config) (struct rspamd_tokenizer_runtime *rt, + gboolean (*load_config) (rspamd_mempool_t *pool, + struct rspamd_tokenizer_runtime *rt, gpointer ptr, gsize len); gboolean (*is_compat) (struct rspamd_tokenizer_runtime *rt); gint (*tokenize_func)(struct rspamd_tokenizer_runtime *rt, @@ -43,7 +45,8 @@ gint rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt, gboolean is_utf, const gchar *prefix); -gpointer rspamd_tokenizer_osb_get_config (struct rspamd_tokenizer_config *cf, +gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool, + struct rspamd_tokenizer_config *cf, gsize *len); gboolean @@ -51,7 +54,8 @@ rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt, gpointer ptr, gsize len); gboolean -rspamd_tokenizer_osb_load_config (struct rspamd_tokenizer_runtime *rt, +rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool, + struct rspamd_tokenizer_runtime *rt, gpointer ptr, gsize len); gboolean -- 2.39.5