diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2016-01-05 19:04:40 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2016-01-05 19:04:40 +0000 |
commit | 1622570f58b5f5b184f97cd75a52a98cc0b1721a (patch) | |
tree | 3510b622bcc91644234a9e9a25825d3f7c1b1de6 /src | |
parent | 57a464ab523700fc7f2ab3f116724cd198799da8 (diff) | |
parent | 29b7115762ad84865b6b657c8f5e88aba16e8eb4 (diff) | |
download | rspamd-1622570f58b5f5b184f97cd75a52a98cc0b1721a.tar.gz rspamd-1622570f58b5f5b184f97cd75a52a98cc0b1721a.zip |
Merge branch 'stat-rework'
Diffstat (limited to 'src')
-rw-r--r-- | src/libserver/task.h | 5 | ||||
-rw-r--r-- | src/libstat/CMakeLists.txt | 12 | ||||
-rw-r--r-- | src/libstat/backends/backends.h | 30 | ||||
-rw-r--r-- | src/libstat/backends/mmaped_file.c | 323 | ||||
-rw-r--r-- | src/libstat/backends/sqlite3_backend.c | 430 | ||||
-rw-r--r-- | src/libstat/classifiers/bayes.c | 313 | ||||
-rw-r--r-- | src/libstat/classifiers/classifiers.h | 51 | ||||
-rw-r--r-- | src/libstat/learn_cache/learn_cache.h | 6 | ||||
-rw-r--r-- | src/libstat/learn_cache/sqlite3_cache.c | 75 | ||||
-rw-r--r-- | src/libstat/stat_config.c | 155 | ||||
-rw-r--r-- | src/libstat/stat_internal.h | 69 | ||||
-rw-r--r-- | src/libstat/stat_process.c | 512 | ||||
-rw-r--r-- | src/libstat/tokenizers/osb.c | 173 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 35 |
14 files changed, 854 insertions, 1335 deletions
diff --git a/src/libserver/task.h b/src/libserver/task.h index 359b2f41f..ed18d99d0 100644 --- a/src/libserver/task.h +++ b/src/libserver/task.h @@ -149,8 +149,7 @@ struct rspamd_task { GHashTable *raw_headers; /**< list of raw headers */ GHashTable *results; /**< hash table of metric_result indexed by * metric's name */ - GHashTable *tokens; /**< hash table of tokens indexed by tokenizer - * pointer */ + GPtrArray *tokens; /**< statistics tokens */ InternetAddressList *rcpt_mime; /**< list of all recipients */ InternetAddressList *rcpt_envelope; /**< list of all recipients */ InternetAddressList *from_mime; @@ -158,7 +157,7 @@ struct rspamd_task { GList *messages; /**< list of messages that would be reported */ struct rspamd_re_runtime *re_rt; /**< regexp runtime */ - GList *cl_runtimes; /**< classifiers runtime */ + GPtrArray *stat_runtimes; /**< backend runtime */ struct rspamd_config *cfg; /**< pointer to config object */ GError *err; rspamd_mempool_t *task_pool; /**< memory pool for task */ diff --git a/src/libstat/CMakeLists.txt b/src/libstat/CMakeLists.txt index 2fa16cf98..80abf9d6f 100644 --- a/src/libstat/CMakeLists.txt +++ b/src/libstat/CMakeLists.txt @@ -10,14 +10,14 @@ SET(CLASSIFIERSSRC ${CMAKE_CURRENT_SOURCE_DIR}/classifiers/bayes.c) SET(BACKENDSSRC ${CMAKE_CURRENT_SOURCE_DIR}/backends/mmaped_file.c ${CMAKE_CURRENT_SOURCE_DIR}/backends/sqlite3_backend.c) IF(ENABLE_HIREDIS MATCHES "ON") - SET(BACKENDSSRC ${BACKENDSSRC} - ${CMAKE_CURRENT_SOURCE_DIR}/backends/redis.c) + #SET(BACKENDSSRC ${BACKENDSSRC} + # ${CMAKE_CURRENT_SOURCE_DIR}/backends/redis.c) ENDIF(ENABLE_HIREDIS MATCHES "ON") SET(CACHESSRC ${CMAKE_CURRENT_SOURCE_DIR}/learn_cache/sqlite3_cache.c) - -SET(RSPAMD_STAT ${LIBSTATSRC} - ${TOKENIZERSSRC} - ${CLASSIFIERSSRC} + +SET(RSPAMD_STAT ${LIBSTATSRC} + ${TOKENIZERSSRC} + ${CLASSIFIERSSRC} ${BACKENDSSRC} ${CACHESSRC} PARENT_SCOPE) diff --git a/src/libstat/backends/backends.h b/src/libstat/backends/backends.h index 4ac59655c..44c2f13ed 100644 --- a/src/libstat/backends/backends.h +++ b/src/libstat/backends/backends.h @@ -36,21 +36,23 @@ struct rspamd_statfile_config; struct rspamd_config; struct rspamd_stat_ctx; struct rspamd_token_result; -struct rspamd_statfile_runtime; -struct token_node_s; +struct rspamd_statfile; struct rspamd_task; struct rspamd_stat_backend { const char *name; - gpointer (*init)(struct rspamd_stat_ctx *ctx, struct rspamd_config *cfg); + gpointer (*init)(struct rspamd_stat_ctx *ctx, struct rspamd_config *cfg, + struct rspamd_statfile *st); gpointer (*runtime)(struct rspamd_task *task, struct rspamd_statfile_config *stcf, gboolean learn, gpointer ctx); - gboolean (*process_token)(struct rspamd_task *task, struct token_node_s *tok, - struct rspamd_token_result *res, gpointer ctx); + gboolean (*process_tokens)(struct rspamd_task *task, GPtrArray *tokens, + gint id, + gpointer ctx); void (*finalize_process)(struct rspamd_task *task, gpointer runtime, gpointer ctx); - gboolean (*learn_token)(struct rspamd_task *task, struct token_node_s *tok, - struct rspamd_token_result *res, gpointer ctx); + gboolean (*learn_tokens)(struct rspamd_task *task, GPtrArray *tokens, + gint id, + gpointer ctx); gulong (*total_learns)(struct rspamd_task *task, gpointer runtime, gpointer ctx); void (*finalize_learn)(struct rspamd_task *task, @@ -67,20 +69,19 @@ struct rspamd_stat_backend { }; #define RSPAMD_STAT_BACKEND_DEF(name) \ - gpointer rspamd_##name##_init (struct rspamd_stat_ctx *ctx, struct rspamd_config *cfg); \ + gpointer rspamd_##name##_init (struct rspamd_stat_ctx *ctx, \ + struct rspamd_config *cfg, struct rspamd_statfile *st); \ gpointer rspamd_##name##_runtime (struct rspamd_task *task, \ struct rspamd_statfile_config *stcf, \ gboolean learn, gpointer ctx); \ - gboolean rspamd_##name##_process_token (struct rspamd_task *task, \ - struct token_node_s *tok, \ - struct rspamd_token_result *res, \ + gboolean rspamd_##name##_process_tokens (struct rspamd_task *task, \ + GPtrArray *tokens, gint id, \ gpointer ctx); \ void rspamd_##name##_finalize_process (struct rspamd_task *task, \ gpointer runtime, \ gpointer ctx); \ - gboolean rspamd_##name##_learn_token (struct rspamd_task *task, \ - struct token_node_s *tok, \ - struct rspamd_token_result *res, \ + gboolean rspamd_##name##_learn_tokens (struct rspamd_task *task, \ + GPtrArray *tokens, gint id, \ gpointer ctx); \ void rspamd_##name##_finalize_learn (struct rspamd_task *task, \ gpointer runtime, \ @@ -104,7 +105,6 @@ struct rspamd_stat_backend { void rspamd_##name##_close (gpointer ctx) RSPAMD_STAT_BACKEND_DEF(mmaped_file); -RSPAMD_STAT_BACKEND_DEF(redis); RSPAMD_STAT_BACKEND_DEF(sqlite3); #endif /* BACKENDS_H_ */ diff --git a/src/libstat/backends/mmaped_file.c b/src/libstat/backends/mmaped_file.c index 2daf6e6cb..5e0b176ef 100644 --- a/src/libstat/backends/mmaped_file.c +++ b/src/libstat/backends/mmaped_file.c @@ -30,14 +30,6 @@ /* Section types */ #define STATFILE_SECTION_COMMON 1 -#define STATFILE_SECTION_HEADERS 2 -#define STATFILE_SECTION_URLS 3 -#define STATFILE_SECTION_REGEXP 4 - -#define DEFAULT_STATFILE_INVALIDATE_TIME 30 -#define DEFAULT_STATFILE_INVALIDATE_JITTER 30 - -#define MMAPED_BACKEND_TYPE "mmap" /** * Common statfile header @@ -90,7 +82,8 @@ typedef struct { #else gchar filename[MAXPATHLEN]; /**< name of file */ #endif - gint fd; /**< descriptor */ + rspamd_mempool_t *pool; + gint fd; /**< descriptor */ void *map; /**< mmaped area */ off_t seek_pos; /**< current seek position */ struct stat_file_section cur_section; /**< current section */ @@ -98,34 +91,23 @@ typedef struct { struct rspamd_statfile_config *cf; } rspamd_mmaped_file_t; -/** - * Statfiles pool - */ -typedef struct { - GHashTable *files; /**< hash table of opened files indexed by name */ - rspamd_mempool_t *pool; /**< memory pool object */ - rspamd_mempool_mutex_t *lock; /**< mutex */ - gboolean mlock_ok; /**< whether it is possible to use mlock (2) to avoid statfiles unloading */ -} rspamd_mmaped_file_ctx; #define RSPAMD_STATFILE_VERSION {'1', '2'} #define BACKUP_SUFFIX ".old" static void rspamd_mmaped_file_set_block_common (rspamd_mempool_t *pool, - rspamd_mmaped_file_ctx *statfiles_pool, rspamd_mmaped_file_t *file, + rspamd_mmaped_file_t *file, guint32 h1, guint32 h2, double value); -rspamd_mmaped_file_t * rspamd_mmaped_file_is_open ( - rspamd_mmaped_file_ctx * pool, struct rspamd_statfile_config *stcf); -rspamd_mmaped_file_t * rspamd_mmaped_file_open (rspamd_mmaped_file_ctx *statfiles_pool, - const gchar *filename, size_t size, struct rspamd_statfile_config *stcf); -gint rspamd_mmaped_file_create (rspamd_mmaped_file_ctx *statfile_pool, - const gchar *filename, size_t size, struct rspamd_statfile_config *stcf, +rspamd_mmaped_file_t * rspamd_mmaped_file_open (rspamd_mempool_t *pool, + const gchar *filename, size_t size, + struct rspamd_statfile_config *stcf); +gint rspamd_mmaped_file_create (const gchar *filename, size_t size, + struct rspamd_statfile_config *stcf, rspamd_mempool_t *pool); double -rspamd_mmaped_file_get_block (rspamd_mmaped_file_ctx * pool, - rspamd_mmaped_file_t * file, +rspamd_mmaped_file_get_block (rspamd_mmaped_file_t * file, guint32 h1, guint32 h2) { @@ -159,8 +141,8 @@ rspamd_mmaped_file_get_block (rspamd_mmaped_file_ctx * pool, static void rspamd_mmaped_file_set_block_common (rspamd_mempool_t *pool, - rspamd_mmaped_file_ctx *statfiles_pool, rspamd_mmaped_file_t *file, - guint32 h1, guint32 h2, double value) + rspamd_mmaped_file_t *file, + guint32 h1, guint32 h2, double value) { struct stat_file_block *block, *to_expire = NULL; struct stat_file_header *header; @@ -240,24 +222,14 @@ rspamd_mmaped_file_set_block_common (rspamd_mempool_t *pool, void rspamd_mmaped_file_set_block (rspamd_mempool_t *pool, - rspamd_mmaped_file_ctx * statfile_pool, rspamd_mmaped_file_t * file, guint32 h1, guint32 h2, double value) { - rspamd_mmaped_file_set_block_common (pool, statfile_pool, file, h1, h2, value); -} - -rspamd_mmaped_file_t * -rspamd_mmaped_file_is_open (rspamd_mmaped_file_ctx * pool, - struct rspamd_statfile_config *stcf) -{ - return g_hash_table_lookup (pool->files, stcf); + rspamd_mmaped_file_set_block_common (pool, file, h1, h2, value); } - - gboolean rspamd_mmaped_file_set_revision (rspamd_mmaped_file_t *file, guint64 rev, time_t time) { @@ -421,11 +393,11 @@ rspamd_mmaped_file_check (rspamd_mempool_t *pool, rspamd_mmaped_file_t * file) static rspamd_mmaped_file_t * -rspamd_mmaped_file_reindex (rspamd_mmaped_file_ctx *statfiles_pool, - const gchar *filename, - size_t old_size, - size_t size, - struct rspamd_statfile_config *stcf) +rspamd_mmaped_file_reindex (rspamd_mempool_t *pool, + const gchar *filename, + size_t old_size, + size_t size, + struct rspamd_statfile_config *stcf) { gchar *backup, *lock; gint fd, lock_fd; @@ -433,7 +405,6 @@ rspamd_mmaped_file_reindex (rspamd_mmaped_file_ctx *statfiles_pool, u_char *map, *pos; struct stat_file_block *block; struct stat_file_header *header, *nh; - rspamd_mempool_t *pool = statfiles_pool->pool; if (size < sizeof (struct stat_file_header) + sizeof (struct stat_file_section) + @@ -454,11 +425,11 @@ rspamd_mmaped_file_reindex (rspamd_mmaped_file_ctx *statfiles_pool, if (!rspamd_file_lock (lock_fd, FALSE)) { g_free (lock); - return rspamd_mmaped_file_open (statfiles_pool, filename, size, stcf); + return rspamd_mmaped_file_open (pool, filename, size, stcf); } } else { - return rspamd_mmaped_file_open (statfiles_pool, filename, size, stcf); + return rspamd_mmaped_file_open (pool, filename, size, stcf); } } @@ -467,7 +438,7 @@ rspamd_mmaped_file_reindex (rspamd_mmaped_file_ctx *statfiles_pool, unlink (lock); g_free (lock); - return rspamd_mmaped_file_open (statfiles_pool, filename, size, stcf); + return rspamd_mmaped_file_open (pool, filename, size, stcf); } @@ -485,7 +456,7 @@ rspamd_mmaped_file_reindex (rspamd_mmaped_file_ctx *statfiles_pool, } /* Now create new file with required size */ - if (rspamd_mmaped_file_create (statfiles_pool, filename, size, stcf, statfiles_pool->pool) != 0) { + if (rspamd_mmaped_file_create (filename, size, stcf, pool) != 0) { msg_err_pool ("cannot create new file"); g_free (backup); unlink (lock); @@ -497,7 +468,7 @@ rspamd_mmaped_file_reindex (rspamd_mmaped_file_ctx *statfiles_pool, } /* Now open new file and start copying */ fd = open (backup, O_RDONLY); - new = rspamd_mmaped_file_open (statfiles_pool, filename, size, stcf); + new = rspamd_mmaped_file_open (pool, filename, size, stcf); if (fd == -1 || new == NULL) { if (fd != -1) { @@ -532,8 +503,8 @@ rspamd_mmaped_file_reindex (rspamd_mmaped_file_ctx *statfiles_pool, while (old_size - (pos - map) >= sizeof (struct stat_file_block)) { block = (struct stat_file_block *)pos; if (block->hash1 != 0 && block->value != 0) { - rspamd_mmaped_file_set_block_common (statfiles_pool->pool, - statfiles_pool, new, block->hash1, + rspamd_mmaped_file_set_block_common (pool, + new, block->hash1, block->hash2, block->value); } pos += sizeof (block); @@ -593,17 +564,13 @@ rspamd_mmaped_file_preload (rspamd_mmaped_file_t *file) } rspamd_mmaped_file_t * -rspamd_mmaped_file_open (rspamd_mmaped_file_ctx *statfiles_pool, +rspamd_mmaped_file_open (rspamd_mempool_t *pool, const gchar *filename, size_t size, struct rspamd_statfile_config *stcf) { struct stat st; rspamd_mmaped_file_t *new_file; - rspamd_mempool_t *pool = statfiles_pool->pool; - if ((new_file = rspamd_mmaped_file_is_open (statfiles_pool, stcf)) != NULL) { - return new_file; - } if (stat (filename, &st) == -1) { msg_info_pool ("cannot stat file %s, error %s, %d", filename, strerror ( @@ -615,7 +582,7 @@ rspamd_mmaped_file_open (rspamd_mmaped_file_ctx *statfiles_pool, && size > sizeof (struct stat_file)) { msg_warn_pool ("need to reindex statfile old size: %Hz, new size: %Hz", (size_t)st.st_size, size); - return rspamd_mmaped_file_reindex (statfiles_pool, filename, st.st_size, size, stcf); + return rspamd_mmaped_file_reindex (pool, filename, st.st_size, size, stcf); } else if (size < sizeof (struct stat_file)) { msg_err_pool ("requested to shrink statfile to %Hz but it is too small", @@ -648,17 +615,10 @@ rspamd_mmaped_file_open (rspamd_mmaped_file_ctx *statfiles_pool, rspamd_strlcpy (new_file->filename, filename, sizeof (new_file->filename)); new_file->len = st.st_size; /* Try to lock pages in RAM */ - if (statfiles_pool->mlock_ok) { - if (mlock (new_file->map, new_file->len) == -1) { - msg_warn_pool ( - "mlock of statfile failed, maybe you need to increase RLIMIT_MEMLOCK limit for a process: %s", - strerror (errno)); - statfiles_pool->mlock_ok = FALSE; - } - } + /* Acquire lock for this operation */ rspamd_file_lock (new_file->fd, FALSE); - if (rspamd_mmaped_file_check (statfiles_pool->pool, new_file) == -1) { + if (rspamd_mmaped_file_check (pool, new_file) == -1) { rspamd_file_unlock (new_file->fd, FALSE); munmap (new_file->map, st.st_size); g_slice_free1 (sizeof (*new_file), new_file); @@ -673,23 +633,13 @@ rspamd_mmaped_file_open (rspamd_mmaped_file_ctx *statfiles_pool, g_assert (stcf->clcf != NULL); - g_hash_table_insert (statfiles_pool->files, stcf, new_file); - return new_file; } gint -rspamd_mmaped_file_close_file (rspamd_mmaped_file_ctx *statfile_pool, +rspamd_mmaped_file_close_file (rspamd_mempool_t *pool, rspamd_mmaped_file_t * file) { - rspamd_mmaped_file_t *pos; - rspamd_mempool_t *pool = statfile_pool->pool; - - if ((pos = rspamd_mmaped_file_is_open (statfile_pool, file->cf)) == NULL) { - msg_info_pool ("file %s is not opened", file->filename); - return -1; - } - if (file->map) { msg_info_pool ("syncing statfile %s", file->filename); msync (file->map, file->len, MS_ASYNC); @@ -705,8 +655,10 @@ rspamd_mmaped_file_close_file (rspamd_mmaped_file_ctx *statfile_pool, } gint -rspamd_mmaped_file_create (rspamd_mmaped_file_ctx *statfile_pool, const gchar *filename, - size_t size, struct rspamd_statfile_config *stcf, rspamd_mempool_t *pool) +rspamd_mmaped_file_create (const gchar *filename, + size_t size, + struct rspamd_statfile_config *stcf, + rspamd_mempool_t *pool) { struct stat_file_header header = { .magic = {'r', 's', 'd'}, @@ -727,11 +679,6 @@ rspamd_mmaped_file_create (rspamd_mmaped_file_ctx *statfile_pool, const gchar *f gpointer tok_conf; gsize tok_conf_len; - if (rspamd_mmaped_file_is_open (statfile_pool, stcf) != NULL) { - msg_info_pool ("file %s is already opened", filename); - return 0; - } - if (size < sizeof (struct stat_file_header) + sizeof (struct stat_file_section) + sizeof (block)) { @@ -838,103 +785,51 @@ rspamd_mmaped_file_create (rspamd_mmaped_file_ctx *statfile_pool, const gchar *f } gpointer -rspamd_mmaped_file_init (struct rspamd_stat_ctx *ctx, struct rspamd_config *cfg) +rspamd_mmaped_file_init (struct rspamd_stat_ctx *ctx, + struct rspamd_config *cfg, struct rspamd_statfile *st) { - rspamd_mmaped_file_ctx *new; - struct rspamd_classifier_config *clf; - struct rspamd_statfile_config *stf; - GList *cur, *curst; + struct rspamd_statfile_config *stf = st->stcf; + rspamd_mmaped_file_t *mf; const ucl_object_t *filenameo, *sizeo; const gchar *filename; gsize size; - new = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (rspamd_mmaped_file_ctx)); - new->pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), "statfiles"); - memcpy (new->pool->tag.uid, cfg->cfg_pool->tag.uid, sizeof (new->pool->tag.uid)); - new->lock = rspamd_mempool_get_mutex (new->pool); - new->mlock_ok = cfg->mlock_statfile_pool; - new->files = g_hash_table_new (g_direct_hash, g_direct_equal); - - /* Iterate over all classifiers and load matching statfiles */ - cur = cfg->classifiers; + filenameo = ucl_object_find_key (stf->opts, "filename"); - while (cur) { - clf = cur->data; + if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) { + filenameo = ucl_object_find_key (stf->opts, "path"); - curst = clf->statfiles; - - if (clf->backend == NULL) { - /* - * By default, all statfiles are treated as mmaped files - */ - clf->backend = MMAPED_BACKEND_TYPE; + if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) { + msg_err_config ("statfile %s has no filename defined", stf->symbol); + return NULL; } + } - if (strcmp (clf->backend, MMAPED_BACKEND_TYPE) == 0) { - while (curst) { - stf = curst->data; - /* - * Check configuration sanity - */ - filenameo = ucl_object_find_key (stf->opts, "filename"); - - if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) { - filenameo = ucl_object_find_key (stf->opts, "path"); - - if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) { - msg_err_config ("statfile %s has no filename defined", stf->symbol); - curst = curst->next; - continue; - } - } - - filename = ucl_object_tostring (filenameo); - - sizeo = ucl_object_find_key (stf->opts, "size"); - - if (sizeo == NULL || ucl_object_type (sizeo) != UCL_INT) { - msg_err_config ("statfile %s has no size defined", stf->symbol); - curst = curst->next; - continue; - } - - size = ucl_object_toint (sizeo); - - rspamd_mmaped_file_open (new, filename, size, stf); + filename = ucl_object_tostring (filenameo); - ctx->statfiles ++; + sizeo = ucl_object_find_key (stf->opts, "size"); - curst = curst->next; - } - } - - cur = g_list_next (cur); + if (sizeo == NULL || ucl_object_type (sizeo) != UCL_INT) { + msg_err_config ("statfile %s has no size defined", stf->symbol); + return NULL; } - return (gpointer)new; + size = ucl_object_toint (sizeo); + mf = rspamd_mmaped_file_open (cfg->cfg_pool, filename, size, stf); + mf->pool = cfg->cfg_pool; + + return (gpointer)mf; } void rspamd_mmaped_file_close (gpointer p) { - rspamd_mmaped_file_ctx *ctx = (rspamd_mmaped_file_ctx *)p; - GHashTableIter it; - gpointer k, v; - rspamd_mmaped_file_t *mf; + rspamd_mmaped_file_t *mf = p; - g_assert (ctx != NULL); + g_assert (p != NULL); - rspamd_mempool_lock_mutex (ctx->lock); - g_hash_table_iter_init (&it, ctx->files); + rspamd_mmaped_file_close_file (mf->pool, mf); - while (g_hash_table_iter_next (&it, &k, &v)) { - mf = v; - rspamd_mmaped_file_close_file (ctx, mf); - } - - g_hash_table_unref (ctx->files); - rspamd_mempool_unlock_mutex (ctx->lock); - /* XXX: we don't delete pool here to avoid deadlocks */ } gpointer @@ -943,109 +838,55 @@ rspamd_mmaped_file_runtime (struct rspamd_task *task, gboolean learn, gpointer p) { - rspamd_mmaped_file_ctx *ctx = (rspamd_mmaped_file_ctx *)p; - rspamd_mmaped_file_t *mf; - const ucl_object_t *filenameo, *sizeo; - const gchar *filename; - gsize size; - - g_assert (ctx != NULL); - - mf = rspamd_mmaped_file_is_open (ctx, stcf); - - if (mf == NULL) { - /* Create file here */ - - filenameo = ucl_object_find_key (stcf->opts, "filename"); - if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) { - filenameo = ucl_object_find_key (stcf->opts, "path"); - if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) { - msg_err_task ("statfile %s has no filename defined", stcf->symbol); - return NULL; - } - } - - filename = ucl_object_tostring (filenameo); - - sizeo = ucl_object_find_key (stcf->opts, "size"); - if (sizeo == NULL || ucl_object_type (sizeo) != UCL_INT) { - msg_err_task ("statfile %s has no size defined", stcf->symbol); - return NULL; - } - - size = ucl_object_toint (sizeo); - - if (learn) { - rspamd_mmaped_file_create (ctx, filename, size, stcf, task->task_pool); - } - - mf = rspamd_mmaped_file_open (ctx, filename, size, stcf); - } + rspamd_mmaped_file_t *mf = p; return (gpointer)mf; } gboolean -rspamd_mmaped_file_process_token (struct rspamd_task *task, rspamd_token_t *tok, - struct rspamd_token_result *res, +rspamd_mmaped_file_process_tokens (struct rspamd_task *task, GPtrArray *tokens, + gint id, gpointer p) { - rspamd_mmaped_file_ctx *ctx = (rspamd_mmaped_file_ctx *)p; - rspamd_mmaped_file_t *mf; + rspamd_mmaped_file_t *mf = p; guint32 h1, h2; + rspamd_token_t *tok; + guint i; - g_assert (res != NULL); + g_assert (tokens != NULL); g_assert (p != NULL); - g_assert (res->st_runtime != NULL); - g_assert (tok != NULL); - g_assert (tok->datalen >= sizeof (guint32) * 2); - mf = (rspamd_mmaped_file_t *)res->st_runtime->backend_runtime; - - if (mf == NULL) { - /* Statfile is does not exist, so all values are zero */ - res->value = 0.0; - return FALSE; - } - - memcpy (&h1, tok->data, sizeof (h1)); - memcpy (&h2, tok->data + sizeof (h1), sizeof (h2)); - res->value = rspamd_mmaped_file_get_block (ctx, mf, h1, h2); - - if (res->value > 0.0) { - return TRUE; + for (i = 0; i < tokens->len; i++) { + tok = g_ptr_array_index (tokens, i); + memcpy (&h1, tok->data, sizeof (h1)); + memcpy (&h2, tok->data + sizeof (h1), sizeof (h2)); + tok->values[id] = rspamd_mmaped_file_get_block (mf, h1, h2); } - return FALSE; + return TRUE; } gboolean -rspamd_mmaped_file_learn_token (struct rspamd_task *task, rspamd_token_t *tok, - struct rspamd_token_result *res, +rspamd_mmaped_file_learn_tokens (struct rspamd_task *task, GPtrArray *tokens, + gint id, gpointer p) { - rspamd_mmaped_file_ctx *ctx = (rspamd_mmaped_file_ctx *)p; - rspamd_mmaped_file_t *mf; + rspamd_mmaped_file_t *mf = p; guint32 h1, h2; + rspamd_token_t *tok; + guint i; - g_assert (res != NULL); + g_assert (tokens != NULL); g_assert (p != NULL); - g_assert (res->st_runtime != NULL); - g_assert (tok != NULL); - g_assert (tok->datalen >= sizeof (guint32) * 2); - - mf = (rspamd_mmaped_file_t *)res->st_runtime->backend_runtime; - if (mf == NULL) { - /* Statfile is does not exist, so all values are zero */ - res->value = 0.0; - return FALSE; + for (i = 0; i < tokens->len; i++) { + tok = g_ptr_array_index (tokens, i); + memcpy (&h1, tok->data, sizeof (h1)); + memcpy (&h2, tok->data + sizeof (h1), sizeof (h2)); + rspamd_mmaped_file_set_block (task->task_pool, mf, h1, h2, + tok->values[id]); } - memcpy (&h1, tok->data, sizeof (h1)); - memcpy (&h2, tok->data + sizeof (h1), sizeof (h2)); - rspamd_mmaped_file_set_block (task->task_pool, ctx, mf, h1, h2, res->value); - return TRUE; } diff --git a/src/libstat/backends/sqlite3_backend.c b/src/libstat/backends/sqlite3_backend.c index 7b9cc8a42..3e05bee80 100644 --- a/src/libstat/backends/sqlite3_backend.c +++ b/src/libstat/backends/sqlite3_backend.c @@ -36,10 +36,11 @@ #define SQLITE3_DEFAULT "default" struct rspamd_stat_sqlite3_db { - struct rspamd_stat_sqlite3_ctx *ctx; sqlite3 *sqlite; gchar *fname; GArray *prstmt; + lua_State *L; + rspamd_mempool_t *pool; gboolean in_transaction; gboolean enable_users; gboolean enable_languages; @@ -47,14 +48,7 @@ struct rspamd_stat_sqlite3_db { gint cbref_language; }; -struct rspamd_stat_sqlite3_ctx { - GHashTable *files; - rspamd_mempool_t *pool; - lua_State *L; -}; - struct rspamd_stat_sqlite3_rt { - struct rspamd_stat_sqlite3_ctx *ctx; struct rspamd_task *task; struct rspamd_stat_sqlite3_db *db; struct rspamd_statfile_config *cf; @@ -311,7 +305,7 @@ rspamd_sqlite3_get_user (struct rspamd_stat_sqlite3_db *db, const gchar *user = NULL; const InternetAddress *ia; struct rspamd_task **ptask; - lua_State *L = db->ctx->L; + lua_State *L = db->L; GString *tb; if (db->cbref_user == -1) { @@ -391,7 +385,7 @@ rspamd_sqlite3_get_language (struct rspamd_stat_sqlite3_db *db, const gchar *language = NULL; struct mime_text_part *tp; struct rspamd_task **ptask; - lua_State *L = db->ctx->L; + lua_State *L = db->L; GString *tb; if (db->cbref_language == -1) { @@ -471,6 +465,7 @@ rspamd_sqlite3_opendb (rspamd_mempool_t *pool, bk = g_slice_alloc0 (sizeof (*bk)); bk->sqlite = rspamd_sqlite3_open_or_create (pool, path, create_tables_sql, err); + bk->pool = pool; if (bk->sqlite == NULL) { g_slice_free1 (sizeof (*bk), bk); @@ -545,194 +540,146 @@ rspamd_sqlite3_opendb (rspamd_mempool_t *pool, gpointer rspamd_sqlite3_init (struct rspamd_stat_ctx *ctx, - struct rspamd_config *cfg) + struct rspamd_config *cfg, + struct rspamd_statfile *st) { - struct rspamd_stat_sqlite3_ctx *new; - struct rspamd_classifier_config *clf; - struct rspamd_statfile_config *stf; - GList *cur, *curst; + struct rspamd_classifier_config *clf = st->classifier->cfg; + struct rspamd_statfile_config *stf = st->stcf; const ucl_object_t *filenameo, *lang_enabled, *users_enabled; const gchar *filename, *lua_script; struct rspamd_stat_sqlite3_db *bk; GError *err = NULL; - new = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (*new)); - new->files = g_hash_table_new (g_direct_hash, g_direct_equal); - new->pool = cfg->cfg_pool; - new->L = cfg->lua_state; - - /* Iterate over all classifiers and load matching statfiles */ - cur = cfg->classifiers; - - while (cur) { - clf = cur->data; - if (clf->backend && strcmp (clf->backend, SQLITE3_BACKEND_TYPE) == 0) { - curst = clf->statfiles; - - while (curst) { - stf = curst->data; - /* - * Check configuration sanity - */ - filenameo = ucl_object_find_key (stf->opts, "filename"); - if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) { - filenameo = ucl_object_find_key (stf->opts, "path"); - if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) { - msg_err_config ("statfile %s has no filename defined", stf->symbol); - curst = curst->next; - continue; - } - } + filenameo = ucl_object_find_key (stf->opts, "filename"); + if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) { + filenameo = ucl_object_find_key (stf->opts, "path"); + if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) { + msg_err_config ("statfile %s has no filename defined", stf->symbol); + return NULL; + } + } - filename = ucl_object_tostring (filenameo); + filename = ucl_object_tostring (filenameo); - if ((bk = rspamd_sqlite3_opendb (cfg->cfg_pool, stf, filename, - stf->opts, TRUE, &err)) == NULL) { - msg_err_config ("cannot open sqlite3 db: %e", err); - } + if ((bk = rspamd_sqlite3_opendb (cfg->cfg_pool, stf, filename, + stf->opts, TRUE, &err)) == NULL) { + msg_err_config ("cannot open sqlite3 db: %e", err); + g_error_free (err); + return NULL; + } - if (bk != NULL) { - bk->ctx = new; - g_hash_table_insert (new->files, stf, bk); - } - else { - g_error_free (err); - err = NULL; - curst = curst->next; - continue; - } + bk->L = cfg->lua_state; - users_enabled = ucl_object_find_any_key (clf->opts, "per_user", - "users_enabled", NULL); - if (users_enabled != NULL) { - if (ucl_object_type (users_enabled) == UCL_BOOLEAN) { - bk->enable_users = ucl_object_toboolean (users_enabled); - bk->cbref_user = -1; - } - else if (ucl_object_type (users_enabled) == UCL_STRING) { - lua_script = ucl_object_tostring (users_enabled); - - if (luaL_dostring (new->L, lua_script) != 0) { - msg_err_config ("cannot execute lua script for users " - "extraction: %s", lua_tostring (new->L, -1)); - } - else { - if (lua_type (new->L, -1) == LUA_TFUNCTION) { - bk->enable_users = TRUE; - bk->cbref_user = luaL_ref (new->L, - LUA_REGISTRYINDEX); - } - else { - msg_err_config ("lua script must return " - "function(task) and not %s", - lua_typename (new->L, lua_type (new->L, -1))); - } - } - } - } - else { - bk->enable_users = FALSE; - } + users_enabled = ucl_object_find_any_key (clf->opts, "per_user", + "users_enabled", NULL); + if (users_enabled != NULL) { + if (ucl_object_type (users_enabled) == UCL_BOOLEAN) { + bk->enable_users = ucl_object_toboolean (users_enabled); + bk->cbref_user = -1; + } + else if (ucl_object_type (users_enabled) == UCL_STRING) { + lua_script = ucl_object_tostring (users_enabled); - lang_enabled = ucl_object_find_any_key (clf->opts, - "per_language", "languages_enabled", NULL); - if (lang_enabled != NULL) { - if (ucl_object_type (lang_enabled) == UCL_BOOLEAN) { - bk->enable_languages = ucl_object_toboolean (lang_enabled); - bk->cbref_language = -1; - } - else if (ucl_object_type (lang_enabled) == UCL_STRING) { - lua_script = ucl_object_tostring (lang_enabled); - - if (luaL_dostring (new->L, lua_script) != 0) { - msg_err_config ( - "cannot execute lua script for languages " - "extraction: %s", - lua_tostring (new->L, -1)); - } - else { - if (lua_type (new->L, -1) == LUA_TFUNCTION) { - bk->enable_languages = TRUE; - bk->cbref_language = luaL_ref (new->L, - LUA_REGISTRYINDEX); - } - else { - msg_err_config ("lua script must return " - "function(task) and not %s", - lua_typename (new->L, - lua_type (new->L, -1))); - } - } - } + if (luaL_dostring (cfg->lua_state, lua_script) != 0) { + msg_err_config ("cannot execute lua script for users " + "extraction: %s", lua_tostring (cfg->lua_state, -1)); + } + else { + if (lua_type (cfg->lua_state, -1) == LUA_TFUNCTION) { + bk->enable_users = TRUE; + bk->cbref_user = luaL_ref (cfg->lua_state, + LUA_REGISTRYINDEX); } else { - bk->enable_languages = FALSE; + msg_err_config ("lua script must return " + "function(task) and not %s", + lua_typename (cfg->lua_state, lua_type ( + cfg->lua_state, -1))); } + } + } + } + else { + bk->enable_users = FALSE; + } - if (bk->enable_languages) { - msg_info_config ("enable per language statistics for %s", - stf->symbol); - } + lang_enabled = ucl_object_find_any_key (clf->opts, + "per_language", "languages_enabled", NULL); - if (bk->enable_users) { - msg_info_config ("enable per users statistics for %s", - stf->symbol); + if (lang_enabled != NULL) { + if (ucl_object_type (lang_enabled) == UCL_BOOLEAN) { + bk->enable_languages = ucl_object_toboolean (lang_enabled); + bk->cbref_language = -1; + } + else if (ucl_object_type (lang_enabled) == UCL_STRING) { + lua_script = ucl_object_tostring (lang_enabled); + + if (luaL_dostring (cfg->lua_state, lua_script) != 0) { + msg_err_config ( + "cannot execute lua script for languages " + "extraction: %s", + lua_tostring (cfg->lua_state, -1)); + } + else { + if (lua_type (cfg->lua_state, -1) == LUA_TFUNCTION) { + bk->enable_languages = TRUE; + bk->cbref_language = luaL_ref (cfg->lua_state, + LUA_REGISTRYINDEX); + } + else { + msg_err_config ("lua script must return " + "function(task) and not %s", + lua_typename (cfg->lua_state, + lua_type (cfg->lua_state, -1))); } - - ctx->statfiles ++; - - curst = curst->next; } } + } + else { + bk->enable_languages = FALSE; + } - cur = g_list_next (cur); + if (bk->enable_languages) { + msg_info_config ("enable per language statistics for %s", + stf->symbol); } - return (gpointer)new; + if (bk->enable_users) { + msg_info_config ("enable per users statistics for %s", + stf->symbol); + } + + + return (gpointer) bk; } void rspamd_sqlite3_close (gpointer p) { - struct rspamd_stat_sqlite3_ctx *ctx = p; - struct rspamd_stat_sqlite3_db *bk; - GHashTableIter it; - gpointer k, v; - - g_hash_table_iter_init (&it, ctx->files); - - while (g_hash_table_iter_next (&it, &k, &v)) { - bk = v; + struct rspamd_stat_sqlite3_db *bk = p; - if (bk->sqlite) { - if (bk->in_transaction) { - rspamd_sqlite3_run_prstmt (ctx->pool, bk->sqlite, bk->prstmt, - RSPAMD_STAT_BACKEND_TRANSACTION_COMMIT); - } - - rspamd_sqlite3_close_prstmt (bk->sqlite, bk->prstmt); - sqlite3_close (bk->sqlite); - g_free (bk->fname); - g_slice_free1 (sizeof (*bk), bk); + if (bk->sqlite) { + if (bk->in_transaction) { + rspamd_sqlite3_run_prstmt (bk->pool, bk->sqlite, bk->prstmt, + RSPAMD_STAT_BACKEND_TRANSACTION_COMMIT); } - } - g_hash_table_destroy (ctx->files); + rspamd_sqlite3_close_prstmt (bk->sqlite, bk->prstmt); + sqlite3_close (bk->sqlite); + g_free (bk->fname); + g_slice_free1 (sizeof (*bk), bk); + } } gpointer rspamd_sqlite3_runtime (struct rspamd_task *task, struct rspamd_statfile_config *stcf, gboolean learn, gpointer p) { - struct rspamd_stat_sqlite3_ctx *ctx = p; struct rspamd_stat_sqlite3_rt *rt = NULL; - struct rspamd_stat_sqlite3_db *bk; - - bk = g_hash_table_lookup (ctx->files, stcf); + struct rspamd_stat_sqlite3_db *bk = p; if (bk) { rt = rspamd_mempool_alloc (task->task_pool, sizeof (*rt)); - rt->ctx = ctx; rt->db = bk; rt->task = task; rt->user_id = -1; @@ -744,66 +691,64 @@ rspamd_sqlite3_runtime (struct rspamd_task *task, } gboolean -rspamd_sqlite3_process_token (struct rspamd_task *task, struct token_node_s *tok, - struct rspamd_token_result *res, gpointer p) +rspamd_sqlite3_process_tokens (struct rspamd_task *task, + GPtrArray *tokens, + gint id, gpointer p) { struct rspamd_stat_sqlite3_db *bk; - struct rspamd_stat_sqlite3_rt *rt; + struct rspamd_stat_sqlite3_rt *rt = p; gint64 iv = 0, idx; + guint i; + rspamd_token_t *tok; - g_assert (res != NULL); g_assert (p != NULL); - g_assert (res->st_runtime != NULL); - g_assert (tok != NULL); - g_assert (tok->datalen >= sizeof (guint32) * 2); + g_assert (tokens != NULL); - rt = res->st_runtime->backend_runtime; bk = rt->db; - if (bk == NULL) { - /* Statfile is does not exist, so all values are zero */ - res->value = 0.0; - return FALSE; - } - - if (!bk->in_transaction) { - rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt, - RSPAMD_STAT_BACKEND_TRANSACTION_START_DEF); - bk->in_transaction = TRUE; - } + for (i = 0; i < tokens->len; i ++) { + tok = g_ptr_array_index (tokens, i); - if (rt->user_id == -1) { - if (bk->enable_users) { - rt->user_id = rspamd_sqlite3_get_user (bk, task, FALSE); - } - else { - rt->user_id = 0; + if (bk == NULL) { + /* Statfile is does not exist, so all values are zero */ + tok->values[id] = 0.0; + continue; } - } - if (rt->lang_id == -1) { - if (bk->enable_languages) { - rt->lang_id = rspamd_sqlite3_get_language (bk, task, FALSE); + if (!bk->in_transaction) { + rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt, + RSPAMD_STAT_BACKEND_TRANSACTION_START_DEF); + bk->in_transaction = TRUE; } - else { - rt->lang_id = 0; + + if (rt->user_id == -1) { + if (bk->enable_users) { + rt->user_id = rspamd_sqlite3_get_user (bk, task, FALSE); + } + else { + rt->user_id = 0; + } } - } - memcpy (&idx, tok->data, sizeof (idx)); + if (rt->lang_id == -1) { + if (bk->enable_languages) { + rt->lang_id = rspamd_sqlite3_get_language (bk, task, FALSE); + } + else { + rt->lang_id = 0; + } + } - if (rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt, - RSPAMD_STAT_BACKEND_GET_TOKEN, - idx, rt->user_id, rt->lang_id, &iv) == SQLITE_OK) { - res->value = iv; + memcpy (&idx, tok->data, sizeof (idx)); - if (iv == 0) { - return FALSE; + if (rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt, + RSPAMD_STAT_BACKEND_GET_TOKEN, + idx, rt->user_id, rt->lang_id, &iv) == SQLITE_OK) { + tok->values[id] = iv; + } + else { + tok->values[id] = 0.0; } - } - else { - res->value = 0.0; - return FALSE; } @@ -833,58 +778,63 @@ rspamd_sqlite3_finalize_process (struct rspamd_task *task, gpointer runtime, } gboolean -rspamd_sqlite3_learn_token (struct rspamd_task *task, struct token_node_s *tok, - struct rspamd_token_result *res, gpointer p) +rspamd_sqlite3_learn_tokens (struct rspamd_task *task, GPtrArray *tokens, + gint id, gpointer p) { struct rspamd_stat_sqlite3_db *bk; - struct rspamd_stat_sqlite3_rt *rt; + struct rspamd_stat_sqlite3_rt *rt = p; gint64 iv = 0, idx; + guint i; + rspamd_token_t *tok; - g_assert (res != NULL); + g_assert (tokens != NULL); g_assert (p != NULL); - g_assert (res->st_runtime != NULL); - g_assert (tok != NULL); - g_assert (tok->datalen >= sizeof (guint32) * 2); - rt = res->st_runtime->backend_runtime; bk = rt->db; - if (bk == NULL) { - /* Statfile is does not exist, so all values are zero */ - return FALSE; - } - - if (!bk->in_transaction) { - rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt, - RSPAMD_STAT_BACKEND_TRANSACTION_START_IM); - bk->in_transaction = TRUE; - } - - if (rt->user_id == -1) { - if (bk->enable_users) { - rt->user_id = rspamd_sqlite3_get_user (bk, task, TRUE); + for (i = 0; i < tokens->len; i++) { + tok = g_ptr_array_index (tokens, i); + if (bk == NULL) { + /* Statfile is does not exist, so all values are zero */ + return FALSE; } - else { - rt->user_id = 0; + + if (!bk->in_transaction) { + rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt, + RSPAMD_STAT_BACKEND_TRANSACTION_START_IM); + bk->in_transaction = TRUE; } - } - if (rt->lang_id == -1) { - if (bk->enable_languages) { - rt->lang_id = rspamd_sqlite3_get_language (bk, task, TRUE); + if (rt->user_id == -1) { + if (bk->enable_users) { + rt->user_id = rspamd_sqlite3_get_user (bk, task, TRUE); + } + else { + rt->user_id = 0; + } } - else { - rt->lang_id = 0; + + if (rt->lang_id == -1) { + if (bk->enable_languages) { + rt->lang_id = rspamd_sqlite3_get_language (bk, task, TRUE); + } + else { + rt->lang_id = 0; + } } - } - iv = res->value; - memcpy (&idx, tok->data, sizeof (idx)); + iv = tok->values[id]; + memcpy (&idx, tok->data, sizeof (idx)); + + if (rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt, + RSPAMD_STAT_BACKEND_SET_TOKEN, + idx, rt->user_id, rt->lang_id, iv) != SQLITE_OK) { + rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt, + RSPAMD_STAT_BACKEND_TRANSACTION_ROLLBACK); + bk->in_transaction = FALSE; - if (rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt, - RSPAMD_STAT_BACKEND_SET_TOKEN, - idx, rt->user_id, rt->lang_id, iv) != SQLITE_OK) { - return FALSE; + return FALSE; + } } return TRUE; @@ -1024,7 +974,7 @@ rspamd_sqlite3_get_stat (gpointer runtime, g_assert (rt != NULL); bk = rt->db; - pool = rt->ctx->pool; + pool = bk->pool; (void)stat (bk->fname, &st); rspamd_sqlite3_run_prstmt (pool, bk->sqlite, bk->prstmt, @@ -1072,7 +1022,7 @@ rspamd_sqlite3_load_tokenizer_config (gpointer runtime, g_assert (rt != NULL); bk = rt->db; - g_assert (rspamd_sqlite3_run_prstmt (rt->ctx->pool, bk->sqlite, bk->prstmt, + g_assert (rspamd_sqlite3_run_prstmt (rt->db->pool, bk->sqlite, bk->prstmt, RSPAMD_STAT_BACKEND_LOAD_TOKENIZER, &sz, &tk_conf) == SQLITE_OK); g_assert (sz > 0); /* diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c index 180aa4658..0915933f1 100644 --- a/src/libstat/classifiers/bayes.c +++ b/src/libstat/classifiers/bayes.c @@ -90,7 +90,10 @@ inv_chi_square (struct rspamd_task *task, gdouble value, gint freedom_deg) } struct bayes_task_closure { - struct rspamd_classifier_runtime *rt; + double ham_prob; + double spam_prob; + guint64 processed_tokens; + guint64 total_hits; struct rspamd_task *task; }; @@ -104,44 +107,46 @@ static const double feature_weight[] = { 0, 1, 4, 27, 256, 3125, 46656, 823543 } /* * In this callback we calculate local probabilities for tokens */ -static gboolean -bayes_classify_callback (gpointer key, gpointer value, gpointer data) +static void +bayes_classify_token (struct rspamd_classifier *ctx, + rspamd_token_t *tok, struct bayes_task_closure *cl) { - rspamd_token_t *node = value; - struct bayes_task_closure *cl = data; - struct rspamd_classifier_runtime *rt; guint i; - struct rspamd_token_result *res; + gint id; guint64 spam_count = 0, ham_count = 0, total_count = 0; + struct rspamd_statfile *st; struct rspamd_task *task; double spam_prob, spam_freq, ham_freq, bayes_spam_prob, bayes_ham_prob, - ham_prob, fw, w, norm_sum, norm_sub; + ham_prob, fw, w, norm_sum, norm_sub, val; - rt = cl->rt; task = cl->task; - for (i = rt->start_pos; i < rt->end_pos; i++) { - res = &g_array_index (node->results, struct rspamd_token_result, i); + for (i = 0; i < ctx->statfiles_ids->len; i++) { + id = g_array_index (ctx->statfiles_ids, gint, i); + st = g_ptr_array_index (ctx->ctx->statfiles, id); + g_assert (st != NULL); + val = tok->values[id]; - if (res->value > 0) { - if (res->st_runtime->st->is_spam) { - spam_count += res->value; + if (val > 0) { + if (st->stcf->is_spam) { + spam_count += val; } else { - ham_count += res->value; + ham_count += val; } - total_count += res->value; - res->st_runtime->total_hits += res->value; + + total_count += val; + cl->total_hits += val; } } /* Probability for this token */ if (total_count > 0) { - spam_freq = ((double)spam_count / MAX (1., (double)rt->total_spam)); - ham_freq = ((double)ham_count / MAX (1., (double)rt->total_ham)); + spam_freq = ((double)spam_count / MAX (1., (double) ctx->spam_learns)); + ham_freq = ((double)ham_count / MAX (1., (double)ctx->ham_learns)); spam_prob = spam_freq / (spam_freq + ham_freq); ham_prob = ham_freq / (spam_freq + ham_freq); - fw = feature_weight[node->window_idx % G_N_ELEMENTS (feature_weight)]; + fw = feature_weight[tok->window_idx % G_N_ELEMENTS (feature_weight)]; norm_sum = (spam_freq + ham_freq) * (spam_freq + ham_freq); norm_sub = (spam_freq - ham_freq) * (spam_freq - ham_freq); w = (norm_sub) / (norm_sum) * @@ -151,9 +156,9 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data) w = (norm_sub) / (norm_sum) * (fw * total_count) / (4.0 * (1.0 + fw * total_count)); bayes_ham_prob = PROB_COMBINE (ham_prob, total_count, w, 0.5); - rt->spam_prob += log (bayes_spam_prob); - rt->ham_prob += log (bayes_ham_prob); - res->cl_runtime->processed_tokens ++; + cl->spam_prob += log (bayes_spam_prob); + cl->ham_prob += log (bayes_ham_prob); + cl->processed_tokens ++; msg_debug_bayes ("token: weight: %f, total_count: %L, " "spam_count: %L, ham_count: %L," @@ -163,10 +168,8 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data) fw, total_count, spam_count, ham_count, spam_prob, ham_prob, bayes_spam_prob, bayes_ham_prob, - rt->spam_prob, rt->ham_prob); + cl->spam_prob, cl->ham_prob); } - - return FALSE; } /* @@ -191,191 +194,153 @@ bayes_normalize_prob (gdouble x) return a*x4 + b*x3 + c*x2 + d*xx; } -struct classifier_ctx * -bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cfg) +void +bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier *cl) { - struct classifier_ctx *ctx = - rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx)); - - ctx->pool = pool; - ctx->cfg = cfg; - ctx->debug = FALSE; - - return ctx; } gboolean -bayes_classify (struct classifier_ctx * ctx, - GTree *input, - struct rspamd_classifier_runtime *rt, - struct rspamd_task *task) +bayes_classify (struct rspamd_classifier * ctx, + GPtrArray *tokens, + struct rspamd_task *task) { double final_prob, h, s; - guint maxhits = 0; - struct rspamd_statfile_runtime *st, *selected_st = NULL; - GList *cur; char *sumbuf; + struct rspamd_statfile *st = NULL; struct bayes_task_closure cl; + rspamd_token_t *tok; + guint i; + gint id; + GList *cur; g_assert (ctx != NULL); - g_assert (input != NULL); - g_assert (rt != NULL); - g_assert (rt->end_pos > rt->start_pos); - - if (rt->stage == RSPAMD_STAT_STAGE_PRE) { - cl.rt = rt; - cl.task = task; - g_tree_foreach (input, bayes_classify_callback, &cl); + g_assert (tokens != NULL); + + memset (&cl, 0, sizeof (cl)); + cl.task = task; + + for (i = 0; i < tokens->len; i ++) { + tok = g_ptr_array_index (tokens, i); + + bayes_classify_token (ctx, tok, &cl); + } + + h = 1 - inv_chi_square (task, cl.spam_prob, cl.processed_tokens); + s = 1 - inv_chi_square (task, cl.ham_prob, cl.processed_tokens); + + if (isfinite (s) && isfinite (h)) { + final_prob = (s + 1.0 - h) / 2.; + msg_debug_bayes ( + "<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f," + " %L tokens processed of %ud total tokens", + task->message_id, + cl.ham_prob, + h, + cl.spam_prob, + s, + cl.processed_tokens, + tokens->len); } else { - h = 1 - inv_chi_square (task, rt->spam_prob, rt->processed_tokens); - s = 1 - inv_chi_square (task, rt->ham_prob, rt->processed_tokens); - - if (isfinite (s) && isfinite (h)) { - final_prob = (s + 1.0 - h) / 2.; - msg_debug_bayes ("<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f," - " %L tokens processed of %ud total tokens", - task->message_id, rt->ham_prob, h, rt->spam_prob, s, - rt->processed_tokens, g_tree_nnodes (input)); + /* + * We have some overflow, hence we need to check which class + * is NaN + */ + if (isfinite (h)) { + final_prob = 1.0; + msg_debug_bayes ("<%s> spam class is overflowed, as we have no" + " ham samples", task->message_id); + } + else if (isfinite (s)) { + final_prob = 0.0; + msg_debug_bayes ("<%s> ham class is overflowed, as we have no" + " spam samples", task->message_id); } else { - /* - * We have some overflow, hence we need to check which class - * is NaN - */ - if (isfinite (h)) { - final_prob = 1.0; - msg_debug_bayes ("<%s> spam class is overflowed, as we have no" - " ham samples", task->message_id); - } - else if (isfinite (s)){ - final_prob = 0.0; - msg_debug_bayes ("<%s> ham class is overflowed, as we have no" - " spam samples", task->message_id); - } - else { - final_prob = 0.5; - msg_warn_bayes ("<%s> spam and ham classes are both overflowed", - task->message_id); - } + final_prob = 0.5; + msg_warn_bayes ("<%s> spam and ham classes are both overflowed", + task->message_id); } + } - if (rt->processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) { + if (cl.processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) { - sumbuf = rspamd_mempool_alloc (task->task_pool, 32); - cur = g_list_first (rt->st_runtime); + sumbuf = rspamd_mempool_alloc (task->task_pool, 32); - while (cur) { - st = (struct rspamd_statfile_runtime *)cur->data; + /* Now we can have exactly one HAM and exactly one SPAM statfiles per classifier */ + for (i = 0; i < ctx->statfiles_ids->len; i++) { + id = g_array_index (ctx->statfiles_ids, gint, i); + st = g_ptr_array_index (ctx->ctx->statfiles, id); - if ((final_prob < 0.5 && !st->st->is_spam) || - (final_prob > 0.5 && st->st->is_spam)) { - if (st->total_hits > maxhits) { - maxhits = st->total_hits; - selected_st = st; - } - } - - cur = g_list_next (cur); + if (final_prob > 0.5 && st->stcf->is_spam) { + break; } - - if (selected_st == NULL) { - msg_err_bayes ( - "unexpected classifier error: cannot select desired statfile, " - "prob: %.4f", final_prob); + else if (final_prob < 0.5 && !st->stcf->is_spam) { + break; } - else { - /* Correctly scale HAM */ - if (final_prob < 0.5) { - final_prob = 1.0 - final_prob; - } - - rspamd_snprintf (sumbuf, 32, "%.2f%%", final_prob * 100.); - final_prob = bayes_normalize_prob (final_prob); + } - cur = g_list_prepend (NULL, sumbuf); - rspamd_task_insert_result (task, - selected_st->st->symbol, - final_prob, - cur); - } + /* Correctly scale HAM */ + if (final_prob < 0.5) { + final_prob = 1.0 - final_prob; } + + rspamd_snprintf (sumbuf, 32, "%.2f%%", final_prob * 100.); + final_prob = bayes_normalize_prob (final_prob); + g_assert (st != NULL); + cur = g_list_prepend (NULL, sumbuf); + rspamd_task_insert_result (task, + st->stcf->symbol, + final_prob, + cur); } return TRUE; } -static gboolean -bayes_learn_spam_callback (gpointer key, gpointer value, gpointer data) +gboolean +bayes_learn_spam (struct rspamd_classifier * ctx, + GPtrArray *tokens, + struct rspamd_task *task, + gboolean is_spam, + GError **err) { - rspamd_token_t *node = value; - struct rspamd_token_result *res; - struct rspamd_classifier_runtime *rt = (struct rspamd_classifier_runtime *)data; - guint i; + guint i, j; + gint id; + struct rspamd_statfile *st; + rspamd_token_t *tok; + g_assert (ctx != NULL); + g_assert (tokens != NULL); - for (i = rt->start_pos; i < rt->end_pos; i++) { - res = &g_array_index (node->results, struct rspamd_token_result, i); - - if (res->st_runtime) { - if (res->st_runtime->st->is_spam) { - res->value ++; - } - else if (res->value > 0) { - /* Unlearning */ - res->value --; - } - } - } - - return FALSE; -} - -static gboolean -bayes_learn_ham_callback (gpointer key, gpointer value, gpointer data) -{ - rspamd_token_t *node = value; - struct rspamd_token_result *res; - struct rspamd_classifier_runtime *rt = (struct rspamd_classifier_runtime *)data; - guint i; - + for (i = 0; i < tokens->len; i++) { + tok = g_ptr_array_index (tokens, i); - for (i = rt->start_pos; i < rt->end_pos; i++) { - res = &g_array_index (node->results, struct rspamd_token_result, i); + for (j = 0; j < ctx->statfiles_ids->len; j++) { + id = g_array_index (ctx->statfiles_ids, gint, j); + st = g_ptr_array_index (ctx->ctx->statfiles, id); + g_assert (st != NULL); - if (res->st_runtime) { - if (!res->st_runtime->st->is_spam) { - res->value ++; + if (is_spam) { + if (st->stcf->is_spam) { + tok->values[id]++; + } + else if (tok->values[id] > 0) { + /* Unlearning */ + tok->values[id]--; + } } - else if (res->value > 0) { - res->value --; + else { + if (!st->stcf->is_spam) { + tok->values[id]++; + } + else if (tok->values[id] > 0) { + /* Unlearning */ + tok->values[id]--; + } } } } - return FALSE; -} - -gboolean -bayes_learn_spam (struct classifier_ctx * ctx, - GTree *input, - struct rspamd_classifier_runtime *rt, - struct rspamd_task *task, - gboolean is_spam, - GError **err) -{ - g_assert (ctx != NULL); - g_assert (input != NULL); - g_assert (rt != NULL); - g_assert (rt->end_pos > rt->start_pos); - - if (is_spam) { - g_tree_foreach (input, bayes_learn_spam_callback, rt); - } - else { - g_tree_foreach (input, bayes_learn_ham_callback, rt); - } - - return TRUE; } diff --git a/src/libstat/classifiers/classifiers.h b/src/libstat/classifiers/classifiers.h index 9a30039df..86395c96d 100644 --- a/src/libstat/classifiers/classifiers.h +++ b/src/libstat/classifiers/classifiers.h @@ -4,49 +4,40 @@ #include "config.h" #include "mem_pool.h" +#define RSPAMD_DEFAULT_CLASSIFIER "bayes" /* Consider this value as 0 */ #define ALPHA 0.0001 struct rspamd_classifier_config; struct rspamd_task; - -/* Common classifier structure */ -struct classifier_ctx { - rspamd_mempool_t *pool; - GHashTable *results; - gboolean debug; - struct rspamd_classifier_config *cfg; -}; +struct rspamd_classifier; struct token_node_s; -struct rspamd_classifier_runtime; struct rspamd_stat_classifier { char *name; - struct classifier_ctx * (*init_func)(rspamd_mempool_t *pool, - struct rspamd_classifier_config *cf); - gboolean (*classify_func)(struct classifier_ctx * ctx, - GTree *input, struct rspamd_classifier_runtime *rt, - struct rspamd_task *task); - gboolean (*learn_spam_func)(struct classifier_ctx * ctx, - GTree *input, struct rspamd_classifier_runtime *rt, - struct rspamd_task *task, gboolean is_spam, - GError **err); + void (*init_func)(rspamd_mempool_t *pool, + struct rspamd_classifier *cl); + gboolean (*classify_func)(struct rspamd_classifier * ctx, + GPtrArray *tokens, + struct rspamd_task *task); + gboolean (*learn_spam_func)(struct rspamd_classifier * ctx, + GPtrArray *input, + struct rspamd_task *task, gboolean is_spam, + GError **err); }; /* Bayes algorithm */ -struct classifier_ctx * bayes_init (rspamd_mempool_t *pool, - struct rspamd_classifier_config *cf); -gboolean bayes_classify (struct classifier_ctx * ctx, - GTree *input, - struct rspamd_classifier_runtime *rt, - struct rspamd_task *task); -gboolean bayes_learn_spam (struct classifier_ctx * ctx, - GTree *input, - struct rspamd_classifier_runtime *rt, - struct rspamd_task *task, - gboolean is_spam, - GError **err); +void bayes_init (rspamd_mempool_t *pool, + struct rspamd_classifier *); +gboolean bayes_classify (struct rspamd_classifier *ctx, + GPtrArray *tokens, + struct rspamd_task *task); +gboolean bayes_learn_spam (struct rspamd_classifier *ctx, + GPtrArray *tokens, + struct rspamd_task *task, + gboolean is_spam, + GError **err); #endif /* diff --git a/src/libstat/learn_cache/learn_cache.h b/src/libstat/learn_cache/learn_cache.h index 273e906f5..84851649f 100644 --- a/src/libstat/learn_cache/learn_cache.h +++ b/src/libstat/learn_cache/learn_cache.h @@ -36,7 +36,8 @@ struct rspamd_config; struct rspamd_stat_cache { const char *name; - gpointer (*init)(struct rspamd_stat_ctx *ctx, struct rspamd_config *cfg); + gpointer (*init)(struct rspamd_stat_ctx *ctx, + struct rspamd_config *cfg, const ucl_object_t *cf); gint (*process)(struct rspamd_task *task, gboolean is_spam, gpointer ctx); @@ -45,7 +46,8 @@ struct rspamd_stat_cache { }; gpointer rspamd_stat_cache_sqlite3_init(struct rspamd_stat_ctx *ctx, - struct rspamd_config *cfg); + struct rspamd_config *cfg, + const ucl_object_t *cf); gint rspamd_stat_cache_sqlite3_process ( struct rspamd_task *task, gboolean is_spam, gpointer c); diff --git a/src/libstat/learn_cache/sqlite3_cache.c b/src/libstat/learn_cache/sqlite3_cache.c index cf4ab615a..9590d00b5 100644 --- a/src/libstat/learn_cache/sqlite3_cache.c +++ b/src/libstat/learn_cache/sqlite3_cache.c @@ -121,71 +121,48 @@ struct rspamd_stat_sqlite3_ctx { gpointer rspamd_stat_cache_sqlite3_init(struct rspamd_stat_ctx *ctx, - struct rspamd_config *cfg) + struct rspamd_config *cfg, + const ucl_object_t *cf) { struct rspamd_stat_sqlite3_ctx *new = NULL; - struct rspamd_classifier_config *clf; - const ucl_object_t *obj, *elt; - GList *cur; + const ucl_object_t *elt; gchar dbpath[PATH_MAX]; + const gchar *path = SQLITE_CACHE_PATH; sqlite3 *sqlite; - gboolean has_sqlite_cache = FALSE; GError *err = NULL; - rspamd_snprintf (dbpath, sizeof (dbpath), SQLITE_CACHE_PATH); - cur = cfg->classifiers; - while (cur) { - clf = cur->data; + if (cf) { + elt = ucl_object_find_key (cf, "path"); - obj = ucl_object_find_key (clf->opts, "cache"); - - /* Sqlite3 cache is the default learn cache method */ - if (obj == NULL) { - has_sqlite_cache = TRUE; - break; + if (elt != NULL) { + path = ucl_object_tostring (elt); } - else if (ucl_object_type (obj) == UCL_OBJECT) { - elt = ucl_object_find_key (obj, "name"); + } - if (ucl_object_type (elt) == UCL_STRING && - g_ascii_strcasecmp (ucl_object_tostring (elt), "sqlite3") == 0) { + rspamd_snprintf (dbpath, sizeof (dbpath), "%s", path); - has_sqlite_cache = TRUE; - elt = ucl_object_find_key (obj, "path"); - if (elt != NULL && ucl_object_type (elt) == UCL_STRING) { - rspamd_snprintf (dbpath, sizeof (dbpath), "%s", - ucl_object_tostring (elt)); - } - } - } + sqlite = rspamd_sqlite3_open_or_create (cfg->cfg_pool, + dbpath, create_tables_sql, &err); - cur = g_list_next (cur); + if (sqlite == NULL) { + msg_err ("cannot open sqlite3 cache: %e", err); + g_error_free (err); + err = NULL; } + else { + new = g_slice_alloc (sizeof (*new)); + new->db = sqlite; + new->prstmt = rspamd_sqlite3_init_prstmt (sqlite, prepared_stmts, + RSPAMD_STAT_CACHE_MAX, &err); - if (has_sqlite_cache) { - sqlite = rspamd_sqlite3_open_or_create (cfg->cfg_pool, - dbpath, create_tables_sql, &err); - - if (sqlite == NULL) { - msg_err_config ("cannot open sqlite3 cache: %e", err); + if (new->prstmt == NULL) { + msg_err ("cannot open sqlite3 cache: %e", err); g_error_free (err); err = NULL; - } - else { - new = g_slice_alloc (sizeof (*new)); - new->db = sqlite; - new->prstmt = rspamd_sqlite3_init_prstmt (sqlite, prepared_stmts, - RSPAMD_STAT_CACHE_MAX, &err); - - if (new->prstmt == NULL) { - msg_err_config ("cannot open sqlite3 cache: %e", err); - g_error_free (err); - err = NULL; - sqlite3_close (sqlite); - g_slice_free1 (sizeof (*new), new); - new = NULL; - } + sqlite3_close (sqlite); + g_slice_free1 (sizeof (*new), new); + new = NULL; } } diff --git a/src/libstat/stat_config.c b/src/libstat/stat_config.c index dbfe16c27..1f16a98de 100644 --- a/src/libstat/stat_config.c +++ b/src/libstat/stat_config.c @@ -44,18 +44,12 @@ static struct rspamd_stat_tokenizer stat_tokenizers[] = { { .name = "osb-text", .get_config = rspamd_tokenizer_osb_get_config, - .compatible_config = rspamd_tokenizer_osb_compatible_config, .tokenize_func = rspamd_tokenizer_osb, - .load_config = rspamd_tokenizer_osb_load_config, - .is_compat = rspamd_tokenizer_osb_is_compat }, { .name = "osb", .get_config = rspamd_tokenizer_osb_get_config, - .compatible_config = rspamd_tokenizer_osb_compatible_config, .tokenize_func = rspamd_tokenizer_osb, - .load_config = rspamd_tokenizer_osb_load_config, - .is_compat = rspamd_tokenizer_osb_is_compat }, }; @@ -63,9 +57,9 @@ static struct rspamd_stat_tokenizer stat_tokenizers[] = { .name = #nam, \ .init = rspamd_##eltn##_init, \ .runtime = rspamd_##eltn##_runtime, \ - .process_token = rspamd_##eltn##_process_token, \ + .process_tokens = rspamd_##eltn##_process_tokens, \ .finalize_process = rspamd_##eltn##_finalize_process, \ - .learn_token = rspamd_##eltn##_learn_token, \ + .learn_tokens = rspamd_##eltn##_learn_tokens, \ .finalize_learn = rspamd_##eltn##_finalize_learn, \ .total_learns = rspamd_##eltn##_total_learns, \ .inc_learns = rspamd_##eltn##_inc_learns, \ @@ -92,32 +86,106 @@ static struct rspamd_stat_cache stat_caches[] = { void rspamd_stat_init (struct rspamd_config *cfg) { - guint i; + GList *cur, *curst; + struct rspamd_classifier_config *clf; + struct rspamd_statfile_config *stf; + struct rspamd_stat_backend *bk; + struct rspamd_statfile *st; + struct rspamd_classifier *cl; + const ucl_object_t *cache_obj = NULL, *cache_name_obj; + const gchar *cache_name = NULL; if (stat_ctx == NULL) { stat_ctx = g_slice_alloc0 (sizeof (*stat_ctx)); } - stat_ctx->backends = stat_backends; + stat_ctx->backends_subrs = stat_backends; stat_ctx->backends_count = G_N_ELEMENTS (stat_backends); - stat_ctx->classifiers = stat_classifiers; + stat_ctx->classifiers_subrs = stat_classifiers; stat_ctx->classifiers_count = G_N_ELEMENTS (stat_classifiers); - stat_ctx->tokenizers = stat_tokenizers; + stat_ctx->tokenizers_subrs = stat_tokenizers; stat_ctx->tokenizers_count = G_N_ELEMENTS (stat_tokenizers); - stat_ctx->caches = stat_caches; + stat_ctx->caches_subrs = stat_caches; stat_ctx->caches_count = G_N_ELEMENTS (stat_caches); stat_ctx->cfg = cfg; + stat_ctx->statfiles = g_ptr_array_new (); + stat_ctx->classifiers = g_ptr_array_new (); + REF_RETAIN (stat_ctx->cfg); - /* Init backends */ - for (i = 0; i < stat_ctx->backends_count; i ++) { - stat_ctx->backends[i].ctx = stat_ctx->backends[i].init (stat_ctx, cfg); - msg_debug_config ("added backend %s", stat_ctx->backends[i].name); - } + /* Create statfiles from the classifiers */ + cur = cfg->classifiers; + + while (cur) { + clf = cur->data; + bk = rspamd_stat_get_backend (clf->backend); + g_assert (bk != NULL); + + /* XXX: + * Here we get the first classifier tokenizer config as the only one + * We NO LONGER support multiple tokenizers per rspamd instance + */ + if (stat_ctx->tkcf == NULL) { + stat_ctx->tokenizer = rspamd_stat_get_tokenizer (clf->tokenizer->name); + g_assert (stat_ctx->tokenizer != NULL); + stat_ctx->tkcf = stat_ctx->tokenizer->get_config (cfg->cfg_pool, + clf->tokenizer, NULL); + } + + cl = g_slice_alloc0 (sizeof (*cl)); + cl->cfg = clf; + cl->ctx = stat_ctx; + cl->statfiles_ids = g_array_new (FALSE, FALSE, sizeof (gint)); + cl->subrs = rspamd_stat_get_classifier (clf->classifier); + g_assert (cl->subrs != NULL); + cl->subrs->init_func (cfg->cfg_pool, cl); + + /* Init classifier cache */ + if (clf->opts) { + cache_obj = ucl_object_find_key (clf->opts, "cache"); + + if (cache_obj) { + cache_name_obj = ucl_object_find_key (cache_obj, "name"); + } + + if (cache_name_obj) { + cache_name = ucl_object_tostring (cache_name_obj); + } + } + + cl->cache = rspamd_stat_get_cache (cache_name); + g_assert (cl->cache != NULL); + cl->cachecf = cl->cache->init (stat_ctx, cfg, cache_obj); + + curst = clf->statfiles; + + while (curst) { + stf = curst->data; + st = g_slice_alloc0 (sizeof (*st)); + st->classifier = cl; + st->stcf = stf; + st->backend = bk; + st->bkcf = bk->init (stat_ctx, cfg, st); + msg_debug_config ("added backend %s for symbol %s", + bk->name, stf->symbol); + + if (st->bkcf == NULL) { + msg_err_config ("cannot init backend %s for statfile %s", + clf->backend, stf->symbol); + + g_slice_free1 (sizeof (*st), st); + } + else { + st->id = stat_ctx->statfiles->len; + g_ptr_array_add (stat_ctx->statfiles, st); + g_array_append_val (cl->statfiles_ids, st->id); + } + + curst = curst->next; + } + + g_ptr_array_add (stat_ctx->classifiers, cl); - /* Init caches */ - for (i = 0; i < stat_ctx->caches_count; i ++) { - stat_ctx->caches[i].ctx = stat_ctx->caches[i].init (stat_ctx, cfg); - msg_debug_config ("added cache %s", stat_ctx->caches[i].name); + cur = cur->next; } } @@ -129,12 +197,9 @@ rspamd_stat_close (void) g_assert (stat_ctx != NULL); - for (i = 0; i < stat_ctx->backends_count; i ++) { - if (stat_ctx->backends[i].close != NULL) { - stat_ctx->backends[i].close (stat_ctx->backends[i].ctx); - msg_debug_config ("closed backend %s", stat_ctx->backends[i].name); - } - } + /* TODO: add cleanup routine */ + + REF_RELEASE (stat_ctx->cfg); } struct rspamd_stat_ctx * @@ -148,9 +213,13 @@ rspamd_stat_get_classifier (const gchar *name) { guint i; + if (name == NULL || name[0] == '\0') { + name = RSPAMD_DEFAULT_CLASSIFIER; + } + for (i = 0; i < stat_ctx->classifiers_count; i ++) { - if (strcmp (name, stat_ctx->classifiers[i].name) == 0) { - return &stat_ctx->classifiers[i]; + if (strcmp (name, stat_ctx->classifiers_subrs[i].name) == 0) { + return &stat_ctx->classifiers_subrs[i]; } } @@ -167,8 +236,8 @@ rspamd_stat_get_backend (const gchar *name) } for (i = 0; i < stat_ctx->backends_count; i ++) { - if (strcmp (name, stat_ctx->backends[i].name) == 0) { - return &stat_ctx->backends[i]; + if (strcmp (name, stat_ctx->backends_subrs[i].name) == 0) { + return &stat_ctx->backends_subrs[i]; } } @@ -185,8 +254,26 @@ rspamd_stat_get_tokenizer (const gchar *name) } for (i = 0; i < stat_ctx->tokenizers_count; i ++) { - if (strcmp (name, stat_ctx->tokenizers[i].name) == 0) { - return &stat_ctx->tokenizers[i]; + if (strcmp (name, stat_ctx->tokenizers_subrs[i].name) == 0) { + return &stat_ctx->tokenizers_subrs[i]; + } + } + + return NULL; +} + +struct rspamd_stat_cache * +rspamd_stat_get_cache (const gchar *name) +{ + guint i; + + if (name == NULL || name[0] == '\0') { + name = RSPAMD_DEFAULT_CACHE; + } + + for (i = 0; i < stat_ctx->caches_count; i++) { + if (strcmp (name, stat_ctx->caches_subrs[i].name) == 0) { + return &stat_ctx->caches_subrs[i]; } } diff --git a/src/libstat/stat_internal.h b/src/libstat/stat_internal.h index 640196788..591c47381 100644 --- a/src/libstat/stat_internal.h +++ b/src/libstat/stat_internal.h @@ -30,20 +30,6 @@ #include "backends/backends.h" #include "learn_cache/learn_cache.h" -enum stat_process_stage { - RSPAMD_STAT_STAGE_PRE = 0, - RSPAMD_STAT_STAGE_POST -}; - -struct rspamd_tokenizer_runtime { - GTree *tokens; - const gchar *name; - struct rspamd_stat_tokenizer *tokenizer; - struct rspamd_tokenizer_config *tkcf; - gpointer config; - gsize conf_len; -}; - struct rspamd_statfile_runtime { struct rspamd_statfile_config *st; gpointer backend_runtime; @@ -51,28 +37,24 @@ struct rspamd_statfile_runtime { guint64 total_hits; }; -struct rspamd_classifier_runtime { - struct rspamd_classifier_config *clcf; - struct classifier_ctx *clctx; - struct rspamd_stat_classifier *cl; - struct rspamd_stat_backend *backend; - struct rspamd_tokenizer_runtime *tok; - double ham_prob; - double spam_prob; - enum stat_process_stage stage; - guint64 total_spam; - guint64 total_ham; - guint64 processed_tokens; - GList *st_runtime; - guint start_pos; - guint end_pos; - gboolean skipped; +/* Common classifier structure */ +struct rspamd_classifier { + struct rspamd_stat_ctx *ctx; + struct rspamd_stat_cache *cache; + gpointer cachecf; + GArray *statfiles_ids; + gulong spam_learns; + gulong ham_learns; + struct rspamd_classifier_config *cfg; + struct rspamd_stat_classifier *subrs; }; -struct rspamd_token_result { - double value; - struct rspamd_statfile_runtime *st_runtime; - struct rspamd_classifier_runtime *cl_runtime; +struct rspamd_statfile { + gint id; + struct rspamd_statfile_config *stcf; + struct rspamd_classifier *classifier; + struct rspamd_stat_backend *backend; + gpointer bkcf; }; #define RSPAMD_MAX_TOKEN_LEN 16 @@ -80,21 +62,27 @@ typedef struct token_node_s { guchar data[RSPAMD_MAX_TOKEN_LEN]; guint window_idx; guint datalen; - GArray *results; + gdouble values[]; } rspamd_token_t; struct rspamd_stat_ctx { - struct rspamd_stat_classifier *classifiers; + /* Subroutines for all objects */ + struct rspamd_stat_classifier *classifiers_subrs; guint classifiers_count; - struct rspamd_stat_tokenizer *tokenizers; + struct rspamd_stat_tokenizer *tokenizers_subrs; guint tokenizers_count; - struct rspamd_stat_backend *backends; + struct rspamd_stat_backend *backends_subrs; guint backends_count; - struct rspamd_stat_cache *caches; + struct rspamd_stat_cache *caches_subrs; guint caches_count; - guint statfiles; + /* Runtime configuration */ + GPtrArray *statfiles; /* struct rspamd_statfile */ + GPtrArray *classifiers; /* struct rspamd_classifier */ struct rspamd_config *cfg; + /* Global tokenizer */ + struct rspamd_stat_tokenizer *tokenizer; + gpointer tkcf; }; typedef enum rspamd_learn_cache_result { @@ -107,6 +95,7 @@ struct rspamd_stat_ctx * rspamd_stat_get_ctx (void); struct rspamd_stat_classifier * rspamd_stat_get_classifier (const gchar *name); struct rspamd_stat_backend * rspamd_stat_get_backend (const gchar *name); struct rspamd_stat_tokenizer * rspamd_stat_get_tokenizer (const gchar *name); +struct rspamd_stat_cache * rspamd_stat_get_cache (const gchar *name); static GQuark rspamd_stat_quark (void) { diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index 8bdf394b1..8a4269727 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2015, Vsevolod Stakhov +/* Copyright (c) 2015-2016, Vsevolod Stakhov * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -37,18 +37,8 @@ static const gint similarity_treshold = 80; -struct preprocess_cb_data { - struct rspamd_task *task; - GList *classifier_runtimes; - struct rspamd_tokenizer_runtime *tok; - guint results_count; - gboolean unlearn; - gboolean spam; -}; - static void rspamd_stat_tokenize_header (struct rspamd_task *task, - struct rspamd_tokenizer_runtime *tok, const gchar *name, const gchar *prefix, GArray *ar) { struct raw_header *rh, *cur; @@ -81,8 +71,8 @@ rspamd_stat_tokenize_header (struct rspamd_task *task, } static void -rspamd_stat_tokenize_parts_metadata (struct rspamd_task *task, - struct rspamd_tokenizer_runtime *tok) +rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx, + struct rspamd_task *task) { struct rspamd_image *img; struct mime_part *part; @@ -164,16 +154,17 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_task *task, cur = g_list_first (task->cfg->classify_headers); while (cur) { - rspamd_stat_tokenize_header (task, tok, cur->data, "UA:", ar); + rspamd_stat_tokenize_header (task, cur->data, "UA:", ar); cur = g_list_next (cur); } - tok->tokenizer->tokenize_func (tok, + st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool, ar, TRUE, - "META:"); + "META:", + task->tokens); g_array_free (ar, TRUE); } @@ -183,24 +174,36 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_task *task, */ static void rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, - struct rspamd_task *task, struct rspamd_tokenizer_runtime *tok) + struct rspamd_task *task) { struct mime_text_part *part; GArray *words; gchar *sub; - guint i; + guint i, reserved_len = 0; gint *pdiff; - gboolean compat; - compat = tok->tokenizer->is_compat (tok); + for (i = 0; i < task->text_parts->len; i++) { + part = g_ptr_array_index (task->text_parts, i); + + if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) { + reserved_len += part->normalized_words->len; + } + /* XXX: normal window size */ + reserved_len += 5; + } + + task->tokens = g_ptr_array_sized_new (reserved_len); + rspamd_mempool_add_destructor (task->task_pool, + rspamd_ptr_array_free_hard, task->tokens); pdiff = rspamd_mempool_get_variable (task->task_pool, "parts_distance"); for (i = 0; i < task->text_parts->len; i ++) { part = g_ptr_array_index (task->text_parts, i); if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) { - tok->tokenizer->tokenize_func (tok, task->task_pool, - part->normalized_words, IS_PART_UTF (part), NULL); + st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool, + part->normalized_words, IS_PART_UTF (part), + NULL, task->tokens); } @@ -219,324 +222,118 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, } if (sub != NULL) { - words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, compat, + words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, FALSE, NULL); if (words != NULL) { - tok->tokenizer->tokenize_func (tok, + st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool, words, TRUE, - "SUBJECT"); + "SUBJECT", + task->tokens); g_array_free (words, TRUE); } } - rspamd_stat_tokenize_parts_metadata (task, tok); + rspamd_stat_tokenize_parts_metadata (st_ctx, task); } -static struct rspamd_tokenizer_runtime * -rspamd_stat_get_tokenizer_runtime (struct rspamd_tokenizer_config *cf, - struct rspamd_stat_ctx *st_ctx, - struct rspamd_task *task, - struct rspamd_classifier_runtime *cl_runtime, - gpointer conf, gsize conf_len) +static void +rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx, + struct rspamd_task *task, gboolean learn) { - struct rspamd_tokenizer_runtime *tok = NULL; - const gchar *name; - - if (cf == NULL || cf->name == NULL) { - name = RSPAMD_DEFAULT_TOKENIZER; - cf->name = name; - } - else { - name = cf->name; - } - - tok = rspamd_mempool_alloc (task->task_pool, sizeof (*tok)); - tok->tokenizer = rspamd_stat_get_tokenizer (name); - tok->tkcf = cf; - - if (tok->tokenizer == NULL) { - return NULL; - } - - if (!tok->tokenizer->load_config (task->task_pool, tok, conf, conf_len)) { - return NULL; - } + guint i; + struct rspamd_statfile *st; + gpointer bk_run; - tok->tokens = g_tree_new (token_node_compare_func); + rspamd_stat_process_tokenize (st_ctx, task); + task->stat_runtimes = g_ptr_array_sized_new (st_ctx->statfiles->len); rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t)g_tree_destroy, tok->tokens); - tok->name = name; - rspamd_stat_process_tokenize (st_ctx, task, tok); - cl_runtime->tok = tok; - - return tok; -} - -static gboolean -preprocess_init_stat_token (gpointer k, gpointer v, gpointer d) -{ - rspamd_token_t *t = (rspamd_token_t *)v; - struct preprocess_cb_data *cbdata = (struct preprocess_cb_data *)d; - struct rspamd_statfile_runtime *st_runtime; - struct rspamd_classifier_runtime *cl_runtime; - struct rspamd_token_result *res; - GList *cur, *curst; - struct rspamd_task *task; - gint i = 0; - - task = cbdata->task; - t->results = g_array_sized_new (FALSE, TRUE, - sizeof (struct rspamd_token_result), cbdata->results_count); - g_array_set_size (t->results, cbdata->results_count); - rspamd_mempool_add_destructor (cbdata->task->task_pool, - rspamd_array_free_hard, t->results); - - cur = g_list_first (cbdata->classifier_runtimes); - - while (cur) { - cl_runtime = (struct rspamd_classifier_runtime *)cur->data; - - if (cl_runtime->clcf->min_tokens > 0 && - (guint32)g_tree_nnodes (cbdata->tok->tokens) < cl_runtime->clcf->min_tokens) { - /* Skip this classifier */ - cur = g_list_next (cur); - cl_runtime->skipped = TRUE; - continue; - } + rspamd_ptr_array_free_hard, task->stat_runtimes); - curst = cl_runtime->st_runtime; - - while (curst) { + for (i = 0; i < st_ctx->statfiles->len; i ++) { + st = g_ptr_array_index (st_ctx->statfiles, i); + g_assert (st != NULL); - st_runtime = (struct rspamd_statfile_runtime *)curst->data; - res = &g_array_index (t->results, struct rspamd_token_result, i); - res->cl_runtime = cl_runtime; - res->st_runtime = st_runtime; - - if (cl_runtime->backend->process_token (cbdata->task, t, res, - cl_runtime->backend->ctx)) { + bk_run = st->backend->runtime (task, st->stcf, learn, st->bkcf); - if (cl_runtime->clcf->max_tokens > 0 && - cl_runtime->processed_tokens > cl_runtime->clcf->max_tokens) { - msg_debug_task ("message contains more tokens than allowed for %s classifier: " - "%uL > %ud", cl_runtime->clcf->name, - cl_runtime->processed_tokens, - cl_runtime->clcf->max_tokens); - - return TRUE; - } - } - - i ++; - curst = g_list_next (curst); + if (bk_run == NULL) { + msg_err_task ("cannot init backend %s for statfile %s", + st->backend->name, st->stcf->symbol); } - cur = g_list_next (cur); + g_ptr_array_add (task->stat_runtimes, bk_run); } - - - return FALSE; } -static GList* -rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx, - struct rspamd_task *task, - lua_State *L, - gint op, - gboolean spam, - const gchar *classifier, - GError **err) +static void +rspamd_stat_backends_process (struct rspamd_stat_ctx *st_ctx, + struct rspamd_task *task) { - struct rspamd_classifier_config *clcf; - struct rspamd_statfile_config *stcf; - struct rspamd_classifier_runtime *cl_runtime; - struct rspamd_statfile_runtime *st_runtime; - struct rspamd_stat_backend *bk; - gpointer backend_runtime, tok_config; - GList *cur, *st_list = NULL, *curst; - GList *cl_runtimes = NULL; - guint result_size = 0, start_pos = 0, end_pos = 0; - gsize conf_len; - struct preprocess_cb_data cbdata; - - cur = g_list_first (task->cfg->classifiers); - - while (cur) { - clcf = (struct rspamd_classifier_config *)cur->data; - st_list = NULL; - - if (classifier != NULL && - (clcf->name == NULL || strcmp (clcf->name, classifier) != 0)) { - /* Skip this classifier */ - msg_debug_task ("skip classifier %s, as we are requested to check %s only", - clcf->name, classifier); - cur = g_list_next (cur); - continue; - } - - if (clcf->pre_callbacks != NULL) { - st_list = rspamd_lua_call_cls_pre_callbacks (clcf, task, FALSE, - FALSE, L); - } - if (st_list != NULL) { - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t)g_list_free, st_list); - } - else { - st_list = clcf->statfiles; - } - - /* Now init runtime values */ - cl_runtime = rspamd_mempool_alloc0 (task->task_pool, sizeof (*cl_runtime)); - cl_runtime->cl = rspamd_stat_get_classifier (clcf->classifier); - - if (cl_runtime->cl == NULL) { - g_set_error (err, rspamd_stat_quark(), 500, - "classifier %s is not defined", clcf->classifier); - g_list_free (cl_runtimes); - return NULL; - } - - cl_runtime->clcf = clcf; - - bk = rspamd_stat_get_backend (clcf->backend); - if (bk == NULL) { - g_set_error (err, rspamd_stat_quark(), 500, - "backend %s is not defined", clcf->backend); - g_list_free (cl_runtimes); - return NULL; - } - - cl_runtime->backend = bk; - - curst = st_list; - while (curst != NULL) { - stcf = (struct rspamd_statfile_config *)curst->data; - - /* On learning skip statfiles that do not belong to class */ - if (op == RSPAMD_LEARN_OP && (spam != stcf->is_spam)) { - curst = g_list_next (curst); - continue; - } - - backend_runtime = bk->runtime (task, stcf, op != RSPAMD_CLASSIFY_OP, - bk->ctx); - - if (backend_runtime == NULL) { - if (op != RSPAMD_CLASSIFY_OP) { - /* Assume backend absence as fatal error */ - g_set_error (err, rspamd_stat_quark(), 500, - "cannot open backend for statfile %s", stcf->symbol); - g_list_free (cl_runtimes); - - return NULL; - } - else { - /* Just skip this element */ - msg_warn ("backend of type %s does not exist: %s", - clcf->backend, stcf->symbol); - curst = g_list_next (curst); - continue; - } - } - - tok_config = bk->load_tokenizer_config (backend_runtime, - &conf_len); - - if (cl_runtime->tok == NULL) { - cl_runtime->tok = rspamd_stat_get_tokenizer_runtime (clcf->tokenizer, - st_ctx, task, cl_runtime, tok_config, conf_len); - - if (cl_runtime->tok == NULL) { - g_set_error (err, rspamd_stat_quark(), 500, - "cannot initialize tokenizer for statfile %s", stcf->symbol); - g_list_free (cl_runtimes); - - return NULL; - } - } + guint i; + struct rspamd_statfile *st; + struct rspamd_classifier *cl; + gpointer bk_run; - if (!cl_runtime->tok->tokenizer->compatible_config ( - cl_runtime->tok, tok_config, conf_len)) { - g_set_error (err, rspamd_stat_quark(), 500, - "incompatible tokenizer for statfile %s", stcf->symbol); - g_list_free (cl_runtimes); + g_assert (task->stat_runtimes != NULL); - return NULL; - } + for (i = 0; i < st_ctx->statfiles->len; i++) { + st = g_ptr_array_index (st_ctx->statfiles, i); + bk_run = g_ptr_array_index (task->stat_runtimes, i); + cl = st->classifier; + g_assert (st != NULL); - st_runtime = rspamd_mempool_alloc0 (task->task_pool, - sizeof (*st_runtime)); - st_runtime->st = stcf; - st_runtime->backend_runtime = backend_runtime; + if (bk_run != NULL) { + st->backend->process_tokens (task, task->tokens, i, bk_run); - if (stcf->is_spam) { - cl_runtime->total_spam += bk->total_learns (task, backend_runtime, - bk->ctx); + if (st->stcf->is_spam) { + cl->spam_learns = st->backend->total_learns (task, + bk_run, + st_ctx); } else { - cl_runtime->total_ham += bk->total_learns (task, backend_runtime, - bk->ctx); + cl->ham_learns = st->backend->total_learns (task, + bk_run, + st_ctx); } - - cl_runtime->st_runtime = g_list_prepend (cl_runtime->st_runtime, - st_runtime); - result_size ++; - - curst = g_list_next (curst); - end_pos ++; - } - - if (cl_runtime->st_runtime != NULL) { - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t)g_list_free, - cl_runtime->st_runtime); - cl_runtimes = g_list_prepend (cl_runtimes, cl_runtime); } + } +} - /* Set positions in the results array */ - cl_runtime->start_pos = start_pos; - cl_runtime->end_pos = end_pos; +static void +rspamd_stat_backends_post_process (struct rspamd_stat_ctx *st_ctx, + struct rspamd_task *task) +{ + guint i; + struct rspamd_statfile *st; + gpointer bk_run; - msg_debug_task ("added runtime for %s classifier from %ud to %ud", - clcf->name, start_pos, end_pos); + g_assert (task->stat_runtimes != NULL); - start_pos = end_pos; + for (i = 0; i < st_ctx->statfiles->len; i++) { + st = g_ptr_array_index (st_ctx->statfiles, i); + bk_run = g_ptr_array_index (task->stat_runtimes, i); + g_assert (st != NULL); - /* Next classifier */ - cur = g_list_next (cur); + if (bk_run != NULL) { + st->backend->finalize_process (task, bk_run, st_ctx); + } } +} - if (cl_runtimes != NULL) { - /* Reverse list as we have used g_list_prepend */ - cl_runtimes = g_list_reverse (cl_runtimes); - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) g_list_free, - cl_runtimes); - cur = g_list_first (cl_runtimes); - - while (cur) { - cl_runtime = cur->data; +static void +rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx, + struct rspamd_task *task) +{ + guint i; + struct rspamd_classifier *cl; - cbdata.results_count = result_size; - cbdata.classifier_runtimes = cl_runtimes; - cbdata.task = task; - cbdata.tok = cl_runtime->tok; - g_tree_foreach (cbdata.tok->tokens, preprocess_init_stat_token, - &cbdata); + for (i = 0; i < st_ctx->classifiers->len; i++) { + cl = g_ptr_array_index (st_ctx->classifiers, i); + g_assert (cl != NULL); - cur = g_list_next (cur); - } - } - else if (classifier != NULL) { - /* We likely cannot find any classifier with this name */ - g_set_error (err, rspamd_stat_quark (), 404, - "cannot find classifier %s", classifier); + cl->subrs->classify_func (cl, task->tokens, task); } - - return cl_runtimes; } rspamd_stat_result_t @@ -544,102 +341,30 @@ rspamd_stat_classify (struct rspamd_task *task, lua_State *L, guint stage, GError **err) { struct rspamd_stat_ctx *st_ctx; - struct rspamd_statfile_runtime *st_run; - struct rspamd_classifier_runtime *cl_run; - GList *cl_runtimes; - GList *cur, *curst; - gboolean ret = RSPAMD_STAT_PROCESS_OK; + rspamd_stat_result_t ret = RSPAMD_STAT_PROCESS_OK; st_ctx = rspamd_stat_get_ctx (); g_assert (st_ctx != NULL); - cl_runtimes = task->cl_runtimes; - if (stage == RSPAMD_TASK_STAGE_CLASSIFIERS_PRE) { - /* Initialize classifiers and statfiles runtime */ - if (task->cl_runtimes == NULL) { - if ((cl_runtimes = rspamd_stat_preprocess (st_ctx, task, L, - RSPAMD_CLASSIFY_OP, FALSE, NULL, err)) == NULL) { - return RSPAMD_STAT_PROCESS_OK; - } - - task->cl_runtimes = cl_runtimes; - cur = cl_runtimes; - /* Finalize backend so it can load tokens delayed if needed */ - while (cur) { - cl_run = (struct rspamd_classifier_runtime *) cur->data; - curst = cl_run->st_runtime; - - while (curst) { - st_run = curst->data; - cl_run->backend->finalize_process (task, - st_run->backend_runtime, - cl_run->backend->ctx); - curst = g_list_next (curst); - } - - cur = g_list_next (cur); - } - } + if (stage == RSPAMD_TASK_STAGE_CLASSIFIERS_PRE) { + /* Preprocess tokens */ + rspamd_stat_preprocess (st_ctx, task, FALSE); } else if (stage == RSPAMD_TASK_STAGE_CLASSIFIERS) { - cur = cl_runtimes; - - /* The first stage of classification */ - while (cur) { - cl_run = (struct rspamd_classifier_runtime *) cur->data; - cl_run->stage = RSPAMD_STAT_STAGE_PRE; - - if (cl_run->cl) { - cl_run->clctx = cl_run->cl->init_func (task->task_pool, - cl_run->clcf); - - if (cl_run->clctx != NULL) { - cl_run->cl->classify_func (cl_run->clctx, cl_run->tok->tokens, - cl_run, task); - } - } - - cur = g_list_next (cur); - } + /* Process backends */ + rspamd_stat_backends_process (st_ctx, task); } else if (stage == RSPAMD_TASK_STAGE_CLASSIFIERS_POST) { - cur = cl_runtimes; - /* The second stage of classification */ - while (cur) { - cl_run = (struct rspamd_classifier_runtime *) cur->data; - cl_run->stage = RSPAMD_STAT_STAGE_POST; - - if (cl_run->skipped) { - cur = g_list_next (cur); - continue; - } - - cl_run = (struct rspamd_classifier_runtime *) cur->data; - cl_run->stage = RSPAMD_STAT_STAGE_POST; - - if (cl_run->skipped) { - cur = g_list_next (cur); - continue; - } - - if (cl_run->cl) { - if (cl_run->clctx != NULL) { - if (cl_run->cl->classify_func (cl_run->clctx, - cl_run->tok->tokens, - cl_run, task)) { - ret = RSPAMD_STAT_PROCESS_OK; - } - } - } - - cur = g_list_next (cur); - } + /* Process classifiers */ + rspamd_stat_backends_post_process (st_ctx, task); + rspamd_stat_classifiers_process (st_ctx, task); } return ret; } +#if 0 static gboolean rspamd_stat_learn_token (gpointer k, gpointer v, gpointer d) { @@ -910,3 +635,26 @@ rspamd_stat_result_t rspamd_stat_statistics (struct rspamd_task *task, return RSPAMD_STAT_PROCESS_OK; } +#else +/* TODO: finish learning */ +rspamd_stat_result_t rspamd_stat_learn (struct rspamd_task *task, + gboolean spam, lua_State *L, const gchar *classifier, + GError **err) +{ + return RSPAMD_STAT_PROCESS_ERROR; +} + +/** + * Get the overall statistics for all statfile backends + * @param cfg configuration + * @param total_learns the total number of learns is stored here + * @return array of statistical information + */ +rspamd_stat_result_t rspamd_stat_statistics (struct rspamd_task *task, + struct rspamd_config *cfg, + guint64 *total_learns, + ucl_object_t **res) +{ + return RSPAMD_STAT_PROCESS_ERROR; +} +#endif diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c index 2d1b3bb3e..55a0c6bba 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -189,6 +189,7 @@ rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool, return osb_cf; } +#if 0 gboolean rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt, gpointer ptr, gsize len) @@ -223,28 +224,68 @@ rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt, return ret; } +gboolean +rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool, + struct rspamd_tokenizer_runtime *rt, + gpointer ptr, gsize len) +{ + struct rspamd_osb_tokenizer_config *osb_cf; + + if (ptr == NULL || len == 0) { + osb_cf = rspamd_tokenizer_osb_config_from_ucl (pool, rt->tkcf->opts); + + if (osb_cf->ht != RSPAMD_OSB_HASH_COMPAT) { + /* Trying to load incompatible configuration */ + msg_err_pool ("cannot load tokenizer configuration from a legacy " + "statfile; maybe you have forgotten to set 'compat' option" + " in the tokenizer configuration"); + + return FALSE; + } + } + else { + g_assert (len == sizeof (*osb_cf)); + osb_cf = ptr; + } + + rt->config = osb_cf; + rt->conf_len = sizeof (*osb_cf); + + return TRUE; +} + +gboolean +rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt) +{ + struct rspamd_osb_tokenizer_config *osb_cf = rt->config; + + return (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT); +} +#endif + + + gint -rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt, - rspamd_mempool_t * pool, - GArray * input, - gboolean is_utf, - const gchar *prefix) +rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, + rspamd_mempool_t *pool, + GArray *words, + gboolean is_utf, + const gchar *prefix, + GPtrArray *result) { - rspamd_token_t *new = NULL; + rspamd_token_t *new_tok = NULL; rspamd_ftok_t *token; struct rspamd_osb_tokenizer_config *osb_cf; guint64 *hashpipe, cur, seed; guint32 h1, h2; + gsize token_size; guint processed = 0, i, w, window_size; - GTree *tree = rt->tokens; - - g_assert (tree != NULL); - if (input == NULL) { + if (words == NULL) { return FALSE; } - osb_cf = rt->config; + osb_cf = ctx->tkcf; window_size = osb_cf->window_size; if (prefix) { @@ -256,9 +297,11 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt, hashpipe = g_alloca (window_size * sizeof (hashpipe[0])); memset (hashpipe, 0xfe, window_size * sizeof (hashpipe[0])); + token_size = sizeof (rspamd_token_t) + sizeof (gdouble) * ctx->statfiles->len; + g_assert (token_size > 0); - for (w = 0; w < input->len; w ++) { - token = &g_array_index (input, rspamd_ftok_t, w); + for (w = 0; w < words->len; w ++) { + token = &g_array_index (words, rspamd_ftok_t, w); if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { cur = rspamd_fstrhash_lc (token, is_utf); @@ -278,6 +321,25 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt, } } +#define ADD_TOKEN do {\ + new_tok = rspamd_mempool_alloc0 (pool, token_size); \ + new_tok->datalen = sizeof (gint64); \ + if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { \ + h1 = ((guint32)hashpipe[0]) * primes[0] + \ + ((guint32)hashpipe[i]) * primes[i << 1]; \ + h2 = ((guint32)hashpipe[0]) * primes[1] + \ + ((guint32)hashpipe[i]) * primes[(i << 1) - 1]; \ + memcpy(new_tok->data, &h1, sizeof (h1)); \ + memcpy(new_tok->data + sizeof (h1), &h2, sizeof (h2)); \ + } \ + else { \ + cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; \ + memcpy (new_tok->data, &cur, sizeof (cur)); \ + } \ + new_tok->window_idx = i + 1; \ + g_ptr_array_add (result, new_tok); \ + } while(0) + if (processed < window_size) { /* Just fill a hashpipe */ hashpipe[window_size - ++processed] = cur; @@ -291,97 +353,20 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt, processed++; for (i = 1; i < window_size; i++) { - new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t)); - new->datalen = sizeof (gint64); - - if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { - h1 = ((guint32)hashpipe[0]) * primes[0] + - ((guint32)hashpipe[i]) * primes[i << 1]; - h2 = ((guint32)hashpipe[0]) * primes[1] + - ((guint32)hashpipe[i]) * primes[(i << 1) - 1]; - - memcpy(new->data, &h1, sizeof (h1)); - memcpy(new->data + sizeof (h1), &h2, sizeof (h2)); - } - else { - cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; - memcpy (new->data, &cur, sizeof (cur)); - } - - new->window_idx = i + 1; - - if (g_tree_lookup (tree, new) == NULL) { - g_tree_insert (tree, new, new); - } + ADD_TOKEN; } } } if (processed <= window_size) { memmove (hashpipe, hashpipe + (window_size - processed + 1), processed); - for (i = 1; i < processed; i++) { - new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t)); - new->datalen = sizeof (gint64); - - if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { - h1 = ((guint32)hashpipe[0]) * primes[0] + - ((guint32)hashpipe[i]) * primes[i << 1]; - h2 = ((guint32)hashpipe[0]) * primes[1] + - ((guint32)hashpipe[i]) * primes[(i << 1) - 1]; - memcpy(new->data, &h1, sizeof (h1)); - memcpy(new->data + sizeof (h1), &h2, sizeof (h2)); - } - else { - cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; - memcpy (new->data, &cur, sizeof (cur)); - } - - new->window_idx = i + 1; - - if (g_tree_lookup (tree, new) == NULL) { - g_tree_insert (tree, new, new); - } - } - } - - return TRUE; -} - - -gboolean -rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool, - struct rspamd_tokenizer_runtime *rt, - gpointer ptr, gsize len) -{ - struct rspamd_osb_tokenizer_config *osb_cf; - if (ptr == NULL || len == 0) { - osb_cf = rspamd_tokenizer_osb_config_from_ucl (pool, rt->tkcf->opts); - - if (osb_cf->ht != RSPAMD_OSB_HASH_COMPAT) { - /* Trying to load incompatible configuration */ - msg_err_pool ("cannot load tokenizer configuration from a legacy " - "statfile; maybe you have forgotten to set 'compat' option" - " in the tokenizer configuration"); - - return FALSE; + for (i = 1; i < processed; i++) { + ADD_TOKEN; } } - else { - g_assert (len == sizeof (*osb_cf)); - osb_cf = ptr; - } - rt->config = osb_cf; - rt->conf_len = sizeof (*osb_cf); +#undef ADD_TOKEN return TRUE; } - -gboolean -rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt) -{ - struct rspamd_osb_tokenizer_config *osb_cf = rt->config; - - return (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT); -} diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index f4c9a5ed3..70ff7560c 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -10,23 +10,19 @@ #define RSPAMD_DEFAULT_TOKENIZER "osb" struct rspamd_tokenizer_runtime; +struct rspamd_stat_ctx; /* Common tokenizer structure */ struct rspamd_stat_tokenizer { gchar *name; gpointer (*get_config) (rspamd_mempool_t *pool, struct rspamd_tokenizer_config *cf, gsize *len); - gboolean (*compatible_config) (struct rspamd_tokenizer_runtime *rt, - gpointer ptr, gsize len); - gboolean (*load_config) (rspamd_mempool_t *pool, - struct rspamd_tokenizer_runtime *rt, - gpointer ptr, gsize len); - gboolean (*is_compat) (struct rspamd_tokenizer_runtime *rt); - gint (*tokenize_func)(struct rspamd_tokenizer_runtime *rt, + gint (*tokenize_func)(struct rspamd_stat_ctx *ctx, rspamd_mempool_t *pool, GArray *words, gboolean is_utf, - const gchar *prefix); + const gchar *prefix, + GPtrArray *result); }; /* Compare two token nodes */ @@ -39,28 +35,17 @@ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, guint64 *hash); /* OSB tokenize function */ -gint rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt, - rspamd_mempool_t *pool, - GArray *input, - gboolean is_utf, - const gchar *prefix); +gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, + rspamd_mempool_t *pool, + GArray *words, + gboolean is_utf, + const gchar *prefix, + GPtrArray *result); gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool, struct rspamd_tokenizer_config *cf, gsize *len); -gboolean -rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt, - gpointer ptr, gsize len); - -gboolean -rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool, - struct rspamd_tokenizer_runtime *rt, - gpointer ptr, gsize len); - -gboolean -rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt); - #endif /* * vi:ts=4 |