summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2016-01-05 19:04:40 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2016-01-05 19:04:40 +0000
commit1622570f58b5f5b184f97cd75a52a98cc0b1721a (patch)
tree3510b622bcc91644234a9e9a25825d3f7c1b1de6 /src
parent57a464ab523700fc7f2ab3f116724cd198799da8 (diff)
parent29b7115762ad84865b6b657c8f5e88aba16e8eb4 (diff)
downloadrspamd-1622570f58b5f5b184f97cd75a52a98cc0b1721a.tar.gz
rspamd-1622570f58b5f5b184f97cd75a52a98cc0b1721a.zip
Merge branch 'stat-rework'
Diffstat (limited to 'src')
-rw-r--r--src/libserver/task.h5
-rw-r--r--src/libstat/CMakeLists.txt12
-rw-r--r--src/libstat/backends/backends.h30
-rw-r--r--src/libstat/backends/mmaped_file.c323
-rw-r--r--src/libstat/backends/sqlite3_backend.c430
-rw-r--r--src/libstat/classifiers/bayes.c313
-rw-r--r--src/libstat/classifiers/classifiers.h51
-rw-r--r--src/libstat/learn_cache/learn_cache.h6
-rw-r--r--src/libstat/learn_cache/sqlite3_cache.c75
-rw-r--r--src/libstat/stat_config.c155
-rw-r--r--src/libstat/stat_internal.h69
-rw-r--r--src/libstat/stat_process.c512
-rw-r--r--src/libstat/tokenizers/osb.c173
-rw-r--r--src/libstat/tokenizers/tokenizers.h35
14 files changed, 854 insertions, 1335 deletions
diff --git a/src/libserver/task.h b/src/libserver/task.h
index 359b2f41f..ed18d99d0 100644
--- a/src/libserver/task.h
+++ b/src/libserver/task.h
@@ -149,8 +149,7 @@ struct rspamd_task {
GHashTable *raw_headers; /**< list of raw headers */
GHashTable *results; /**< hash table of metric_result indexed by
* metric's name */
- GHashTable *tokens; /**< hash table of tokens indexed by tokenizer
- * pointer */
+ GPtrArray *tokens; /**< statistics tokens */
InternetAddressList *rcpt_mime; /**< list of all recipients */
InternetAddressList *rcpt_envelope; /**< list of all recipients */
InternetAddressList *from_mime;
@@ -158,7 +157,7 @@ struct rspamd_task {
GList *messages; /**< list of messages that would be reported */
struct rspamd_re_runtime *re_rt; /**< regexp runtime */
- GList *cl_runtimes; /**< classifiers runtime */
+ GPtrArray *stat_runtimes; /**< backend runtime */
struct rspamd_config *cfg; /**< pointer to config object */
GError *err;
rspamd_mempool_t *task_pool; /**< memory pool for task */
diff --git a/src/libstat/CMakeLists.txt b/src/libstat/CMakeLists.txt
index 2fa16cf98..80abf9d6f 100644
--- a/src/libstat/CMakeLists.txt
+++ b/src/libstat/CMakeLists.txt
@@ -10,14 +10,14 @@ SET(CLASSIFIERSSRC ${CMAKE_CURRENT_SOURCE_DIR}/classifiers/bayes.c)
SET(BACKENDSSRC ${CMAKE_CURRENT_SOURCE_DIR}/backends/mmaped_file.c
${CMAKE_CURRENT_SOURCE_DIR}/backends/sqlite3_backend.c)
IF(ENABLE_HIREDIS MATCHES "ON")
- SET(BACKENDSSRC ${BACKENDSSRC}
- ${CMAKE_CURRENT_SOURCE_DIR}/backends/redis.c)
+ #SET(BACKENDSSRC ${BACKENDSSRC}
+ # ${CMAKE_CURRENT_SOURCE_DIR}/backends/redis.c)
ENDIF(ENABLE_HIREDIS MATCHES "ON")
SET(CACHESSRC ${CMAKE_CURRENT_SOURCE_DIR}/learn_cache/sqlite3_cache.c)
-
-SET(RSPAMD_STAT ${LIBSTATSRC}
- ${TOKENIZERSSRC}
- ${CLASSIFIERSSRC}
+
+SET(RSPAMD_STAT ${LIBSTATSRC}
+ ${TOKENIZERSSRC}
+ ${CLASSIFIERSSRC}
${BACKENDSSRC}
${CACHESSRC} PARENT_SCOPE)
diff --git a/src/libstat/backends/backends.h b/src/libstat/backends/backends.h
index 4ac59655c..44c2f13ed 100644
--- a/src/libstat/backends/backends.h
+++ b/src/libstat/backends/backends.h
@@ -36,21 +36,23 @@ struct rspamd_statfile_config;
struct rspamd_config;
struct rspamd_stat_ctx;
struct rspamd_token_result;
-struct rspamd_statfile_runtime;
-struct token_node_s;
+struct rspamd_statfile;
struct rspamd_task;
struct rspamd_stat_backend {
const char *name;
- gpointer (*init)(struct rspamd_stat_ctx *ctx, struct rspamd_config *cfg);
+ gpointer (*init)(struct rspamd_stat_ctx *ctx, struct rspamd_config *cfg,
+ struct rspamd_statfile *st);
gpointer (*runtime)(struct rspamd_task *task,
struct rspamd_statfile_config *stcf, gboolean learn, gpointer ctx);
- gboolean (*process_token)(struct rspamd_task *task, struct token_node_s *tok,
- struct rspamd_token_result *res, gpointer ctx);
+ gboolean (*process_tokens)(struct rspamd_task *task, GPtrArray *tokens,
+ gint id,
+ gpointer ctx);
void (*finalize_process)(struct rspamd_task *task,
gpointer runtime, gpointer ctx);
- gboolean (*learn_token)(struct rspamd_task *task, struct token_node_s *tok,
- struct rspamd_token_result *res, gpointer ctx);
+ gboolean (*learn_tokens)(struct rspamd_task *task, GPtrArray *tokens,
+ gint id,
+ gpointer ctx);
gulong (*total_learns)(struct rspamd_task *task,
gpointer runtime, gpointer ctx);
void (*finalize_learn)(struct rspamd_task *task,
@@ -67,20 +69,19 @@ struct rspamd_stat_backend {
};
#define RSPAMD_STAT_BACKEND_DEF(name) \
- gpointer rspamd_##name##_init (struct rspamd_stat_ctx *ctx, struct rspamd_config *cfg); \
+ gpointer rspamd_##name##_init (struct rspamd_stat_ctx *ctx, \
+ struct rspamd_config *cfg, struct rspamd_statfile *st); \
gpointer rspamd_##name##_runtime (struct rspamd_task *task, \
struct rspamd_statfile_config *stcf, \
gboolean learn, gpointer ctx); \
- gboolean rspamd_##name##_process_token (struct rspamd_task *task, \
- struct token_node_s *tok, \
- struct rspamd_token_result *res, \
+ gboolean rspamd_##name##_process_tokens (struct rspamd_task *task, \
+ GPtrArray *tokens, gint id, \
gpointer ctx); \
void rspamd_##name##_finalize_process (struct rspamd_task *task, \
gpointer runtime, \
gpointer ctx); \
- gboolean rspamd_##name##_learn_token (struct rspamd_task *task, \
- struct token_node_s *tok, \
- struct rspamd_token_result *res, \
+ gboolean rspamd_##name##_learn_tokens (struct rspamd_task *task, \
+ GPtrArray *tokens, gint id, \
gpointer ctx); \
void rspamd_##name##_finalize_learn (struct rspamd_task *task, \
gpointer runtime, \
@@ -104,7 +105,6 @@ struct rspamd_stat_backend {
void rspamd_##name##_close (gpointer ctx)
RSPAMD_STAT_BACKEND_DEF(mmaped_file);
-RSPAMD_STAT_BACKEND_DEF(redis);
RSPAMD_STAT_BACKEND_DEF(sqlite3);
#endif /* BACKENDS_H_ */
diff --git a/src/libstat/backends/mmaped_file.c b/src/libstat/backends/mmaped_file.c
index 2daf6e6cb..5e0b176ef 100644
--- a/src/libstat/backends/mmaped_file.c
+++ b/src/libstat/backends/mmaped_file.c
@@ -30,14 +30,6 @@
/* Section types */
#define STATFILE_SECTION_COMMON 1
-#define STATFILE_SECTION_HEADERS 2
-#define STATFILE_SECTION_URLS 3
-#define STATFILE_SECTION_REGEXP 4
-
-#define DEFAULT_STATFILE_INVALIDATE_TIME 30
-#define DEFAULT_STATFILE_INVALIDATE_JITTER 30
-
-#define MMAPED_BACKEND_TYPE "mmap"
/**
* Common statfile header
@@ -90,7 +82,8 @@ typedef struct {
#else
gchar filename[MAXPATHLEN]; /**< name of file */
#endif
- gint fd; /**< descriptor */
+ rspamd_mempool_t *pool;
+ gint fd; /**< descriptor */
void *map; /**< mmaped area */
off_t seek_pos; /**< current seek position */
struct stat_file_section cur_section; /**< current section */
@@ -98,34 +91,23 @@ typedef struct {
struct rspamd_statfile_config *cf;
} rspamd_mmaped_file_t;
-/**
- * Statfiles pool
- */
-typedef struct {
- GHashTable *files; /**< hash table of opened files indexed by name */
- rspamd_mempool_t *pool; /**< memory pool object */
- rspamd_mempool_mutex_t *lock; /**< mutex */
- gboolean mlock_ok; /**< whether it is possible to use mlock (2) to avoid statfiles unloading */
-} rspamd_mmaped_file_ctx;
#define RSPAMD_STATFILE_VERSION {'1', '2'}
#define BACKUP_SUFFIX ".old"
static void rspamd_mmaped_file_set_block_common (rspamd_mempool_t *pool,
- rspamd_mmaped_file_ctx *statfiles_pool, rspamd_mmaped_file_t *file,
+ rspamd_mmaped_file_t *file,
guint32 h1, guint32 h2, double value);
-rspamd_mmaped_file_t * rspamd_mmaped_file_is_open (
- rspamd_mmaped_file_ctx * pool, struct rspamd_statfile_config *stcf);
-rspamd_mmaped_file_t * rspamd_mmaped_file_open (rspamd_mmaped_file_ctx *statfiles_pool,
- const gchar *filename, size_t size, struct rspamd_statfile_config *stcf);
-gint rspamd_mmaped_file_create (rspamd_mmaped_file_ctx *statfile_pool,
- const gchar *filename, size_t size, struct rspamd_statfile_config *stcf,
+rspamd_mmaped_file_t * rspamd_mmaped_file_open (rspamd_mempool_t *pool,
+ const gchar *filename, size_t size,
+ struct rspamd_statfile_config *stcf);
+gint rspamd_mmaped_file_create (const gchar *filename, size_t size,
+ struct rspamd_statfile_config *stcf,
rspamd_mempool_t *pool);
double
-rspamd_mmaped_file_get_block (rspamd_mmaped_file_ctx * pool,
- rspamd_mmaped_file_t * file,
+rspamd_mmaped_file_get_block (rspamd_mmaped_file_t * file,
guint32 h1,
guint32 h2)
{
@@ -159,8 +141,8 @@ rspamd_mmaped_file_get_block (rspamd_mmaped_file_ctx * pool,
static void
rspamd_mmaped_file_set_block_common (rspamd_mempool_t *pool,
- rspamd_mmaped_file_ctx *statfiles_pool, rspamd_mmaped_file_t *file,
- guint32 h1, guint32 h2, double value)
+ rspamd_mmaped_file_t *file,
+ guint32 h1, guint32 h2, double value)
{
struct stat_file_block *block, *to_expire = NULL;
struct stat_file_header *header;
@@ -240,24 +222,14 @@ rspamd_mmaped_file_set_block_common (rspamd_mempool_t *pool,
void
rspamd_mmaped_file_set_block (rspamd_mempool_t *pool,
- rspamd_mmaped_file_ctx * statfile_pool,
rspamd_mmaped_file_t * file,
guint32 h1,
guint32 h2,
double value)
{
- rspamd_mmaped_file_set_block_common (pool, statfile_pool, file, h1, h2, value);
-}
-
-rspamd_mmaped_file_t *
-rspamd_mmaped_file_is_open (rspamd_mmaped_file_ctx * pool,
- struct rspamd_statfile_config *stcf)
-{
- return g_hash_table_lookup (pool->files, stcf);
+ rspamd_mmaped_file_set_block_common (pool, file, h1, h2, value);
}
-
-
gboolean
rspamd_mmaped_file_set_revision (rspamd_mmaped_file_t *file, guint64 rev, time_t time)
{
@@ -421,11 +393,11 @@ rspamd_mmaped_file_check (rspamd_mempool_t *pool, rspamd_mmaped_file_t * file)
static rspamd_mmaped_file_t *
-rspamd_mmaped_file_reindex (rspamd_mmaped_file_ctx *statfiles_pool,
- const gchar *filename,
- size_t old_size,
- size_t size,
- struct rspamd_statfile_config *stcf)
+rspamd_mmaped_file_reindex (rspamd_mempool_t *pool,
+ const gchar *filename,
+ size_t old_size,
+ size_t size,
+ struct rspamd_statfile_config *stcf)
{
gchar *backup, *lock;
gint fd, lock_fd;
@@ -433,7 +405,6 @@ rspamd_mmaped_file_reindex (rspamd_mmaped_file_ctx *statfiles_pool,
u_char *map, *pos;
struct stat_file_block *block;
struct stat_file_header *header, *nh;
- rspamd_mempool_t *pool = statfiles_pool->pool;
if (size <
sizeof (struct stat_file_header) + sizeof (struct stat_file_section) +
@@ -454,11 +425,11 @@ rspamd_mmaped_file_reindex (rspamd_mmaped_file_ctx *statfiles_pool,
if (!rspamd_file_lock (lock_fd, FALSE)) {
g_free (lock);
- return rspamd_mmaped_file_open (statfiles_pool, filename, size, stcf);
+ return rspamd_mmaped_file_open (pool, filename, size, stcf);
}
}
else {
- return rspamd_mmaped_file_open (statfiles_pool, filename, size, stcf);
+ return rspamd_mmaped_file_open (pool, filename, size, stcf);
}
}
@@ -467,7 +438,7 @@ rspamd_mmaped_file_reindex (rspamd_mmaped_file_ctx *statfiles_pool,
unlink (lock);
g_free (lock);
- return rspamd_mmaped_file_open (statfiles_pool, filename, size, stcf);
+ return rspamd_mmaped_file_open (pool, filename, size, stcf);
}
@@ -485,7 +456,7 @@ rspamd_mmaped_file_reindex (rspamd_mmaped_file_ctx *statfiles_pool,
}
/* Now create new file with required size */
- if (rspamd_mmaped_file_create (statfiles_pool, filename, size, stcf, statfiles_pool->pool) != 0) {
+ if (rspamd_mmaped_file_create (filename, size, stcf, pool) != 0) {
msg_err_pool ("cannot create new file");
g_free (backup);
unlink (lock);
@@ -497,7 +468,7 @@ rspamd_mmaped_file_reindex (rspamd_mmaped_file_ctx *statfiles_pool,
}
/* Now open new file and start copying */
fd = open (backup, O_RDONLY);
- new = rspamd_mmaped_file_open (statfiles_pool, filename, size, stcf);
+ new = rspamd_mmaped_file_open (pool, filename, size, stcf);
if (fd == -1 || new == NULL) {
if (fd != -1) {
@@ -532,8 +503,8 @@ rspamd_mmaped_file_reindex (rspamd_mmaped_file_ctx *statfiles_pool,
while (old_size - (pos - map) >= sizeof (struct stat_file_block)) {
block = (struct stat_file_block *)pos;
if (block->hash1 != 0 && block->value != 0) {
- rspamd_mmaped_file_set_block_common (statfiles_pool->pool,
- statfiles_pool, new, block->hash1,
+ rspamd_mmaped_file_set_block_common (pool,
+ new, block->hash1,
block->hash2, block->value);
}
pos += sizeof (block);
@@ -593,17 +564,13 @@ rspamd_mmaped_file_preload (rspamd_mmaped_file_t *file)
}
rspamd_mmaped_file_t *
-rspamd_mmaped_file_open (rspamd_mmaped_file_ctx *statfiles_pool,
+rspamd_mmaped_file_open (rspamd_mempool_t *pool,
const gchar *filename, size_t size,
struct rspamd_statfile_config *stcf)
{
struct stat st;
rspamd_mmaped_file_t *new_file;
- rspamd_mempool_t *pool = statfiles_pool->pool;
- if ((new_file = rspamd_mmaped_file_is_open (statfiles_pool, stcf)) != NULL) {
- return new_file;
- }
if (stat (filename, &st) == -1) {
msg_info_pool ("cannot stat file %s, error %s, %d", filename, strerror (
@@ -615,7 +582,7 @@ rspamd_mmaped_file_open (rspamd_mmaped_file_ctx *statfiles_pool,
&& size > sizeof (struct stat_file)) {
msg_warn_pool ("need to reindex statfile old size: %Hz, new size: %Hz",
(size_t)st.st_size, size);
- return rspamd_mmaped_file_reindex (statfiles_pool, filename, st.st_size, size, stcf);
+ return rspamd_mmaped_file_reindex (pool, filename, st.st_size, size, stcf);
}
else if (size < sizeof (struct stat_file)) {
msg_err_pool ("requested to shrink statfile to %Hz but it is too small",
@@ -648,17 +615,10 @@ rspamd_mmaped_file_open (rspamd_mmaped_file_ctx *statfiles_pool,
rspamd_strlcpy (new_file->filename, filename, sizeof (new_file->filename));
new_file->len = st.st_size;
/* Try to lock pages in RAM */
- if (statfiles_pool->mlock_ok) {
- if (mlock (new_file->map, new_file->len) == -1) {
- msg_warn_pool (
- "mlock of statfile failed, maybe you need to increase RLIMIT_MEMLOCK limit for a process: %s",
- strerror (errno));
- statfiles_pool->mlock_ok = FALSE;
- }
- }
+
/* Acquire lock for this operation */
rspamd_file_lock (new_file->fd, FALSE);
- if (rspamd_mmaped_file_check (statfiles_pool->pool, new_file) == -1) {
+ if (rspamd_mmaped_file_check (pool, new_file) == -1) {
rspamd_file_unlock (new_file->fd, FALSE);
munmap (new_file->map, st.st_size);
g_slice_free1 (sizeof (*new_file), new_file);
@@ -673,23 +633,13 @@ rspamd_mmaped_file_open (rspamd_mmaped_file_ctx *statfiles_pool,
g_assert (stcf->clcf != NULL);
- g_hash_table_insert (statfiles_pool->files, stcf, new_file);
-
return new_file;
}
gint
-rspamd_mmaped_file_close_file (rspamd_mmaped_file_ctx *statfile_pool,
+rspamd_mmaped_file_close_file (rspamd_mempool_t *pool,
rspamd_mmaped_file_t * file)
{
- rspamd_mmaped_file_t *pos;
- rspamd_mempool_t *pool = statfile_pool->pool;
-
- if ((pos = rspamd_mmaped_file_is_open (statfile_pool, file->cf)) == NULL) {
- msg_info_pool ("file %s is not opened", file->filename);
- return -1;
- }
-
if (file->map) {
msg_info_pool ("syncing statfile %s", file->filename);
msync (file->map, file->len, MS_ASYNC);
@@ -705,8 +655,10 @@ rspamd_mmaped_file_close_file (rspamd_mmaped_file_ctx *statfile_pool,
}
gint
-rspamd_mmaped_file_create (rspamd_mmaped_file_ctx *statfile_pool, const gchar *filename,
- size_t size, struct rspamd_statfile_config *stcf, rspamd_mempool_t *pool)
+rspamd_mmaped_file_create (const gchar *filename,
+ size_t size,
+ struct rspamd_statfile_config *stcf,
+ rspamd_mempool_t *pool)
{
struct stat_file_header header = {
.magic = {'r', 's', 'd'},
@@ -727,11 +679,6 @@ rspamd_mmaped_file_create (rspamd_mmaped_file_ctx *statfile_pool, const gchar *f
gpointer tok_conf;
gsize tok_conf_len;
- if (rspamd_mmaped_file_is_open (statfile_pool, stcf) != NULL) {
- msg_info_pool ("file %s is already opened", filename);
- return 0;
- }
-
if (size <
sizeof (struct stat_file_header) + sizeof (struct stat_file_section) +
sizeof (block)) {
@@ -838,103 +785,51 @@ rspamd_mmaped_file_create (rspamd_mmaped_file_ctx *statfile_pool, const gchar *f
}
gpointer
-rspamd_mmaped_file_init (struct rspamd_stat_ctx *ctx, struct rspamd_config *cfg)
+rspamd_mmaped_file_init (struct rspamd_stat_ctx *ctx,
+ struct rspamd_config *cfg, struct rspamd_statfile *st)
{
- rspamd_mmaped_file_ctx *new;
- struct rspamd_classifier_config *clf;
- struct rspamd_statfile_config *stf;
- GList *cur, *curst;
+ struct rspamd_statfile_config *stf = st->stcf;
+ rspamd_mmaped_file_t *mf;
const ucl_object_t *filenameo, *sizeo;
const gchar *filename;
gsize size;
- new = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (rspamd_mmaped_file_ctx));
- new->pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), "statfiles");
- memcpy (new->pool->tag.uid, cfg->cfg_pool->tag.uid, sizeof (new->pool->tag.uid));
- new->lock = rspamd_mempool_get_mutex (new->pool);
- new->mlock_ok = cfg->mlock_statfile_pool;
- new->files = g_hash_table_new (g_direct_hash, g_direct_equal);
-
- /* Iterate over all classifiers and load matching statfiles */
- cur = cfg->classifiers;
+ filenameo = ucl_object_find_key (stf->opts, "filename");
- while (cur) {
- clf = cur->data;
+ if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) {
+ filenameo = ucl_object_find_key (stf->opts, "path");
- curst = clf->statfiles;
-
- if (clf->backend == NULL) {
- /*
- * By default, all statfiles are treated as mmaped files
- */
- clf->backend = MMAPED_BACKEND_TYPE;
+ if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) {
+ msg_err_config ("statfile %s has no filename defined", stf->symbol);
+ return NULL;
}
+ }
- if (strcmp (clf->backend, MMAPED_BACKEND_TYPE) == 0) {
- while (curst) {
- stf = curst->data;
- /*
- * Check configuration sanity
- */
- filenameo = ucl_object_find_key (stf->opts, "filename");
-
- if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) {
- filenameo = ucl_object_find_key (stf->opts, "path");
-
- if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) {
- msg_err_config ("statfile %s has no filename defined", stf->symbol);
- curst = curst->next;
- continue;
- }
- }
-
- filename = ucl_object_tostring (filenameo);
-
- sizeo = ucl_object_find_key (stf->opts, "size");
-
- if (sizeo == NULL || ucl_object_type (sizeo) != UCL_INT) {
- msg_err_config ("statfile %s has no size defined", stf->symbol);
- curst = curst->next;
- continue;
- }
-
- size = ucl_object_toint (sizeo);
-
- rspamd_mmaped_file_open (new, filename, size, stf);
+ filename = ucl_object_tostring (filenameo);
- ctx->statfiles ++;
+ sizeo = ucl_object_find_key (stf->opts, "size");
- curst = curst->next;
- }
- }
-
- cur = g_list_next (cur);
+ if (sizeo == NULL || ucl_object_type (sizeo) != UCL_INT) {
+ msg_err_config ("statfile %s has no size defined", stf->symbol);
+ return NULL;
}
- return (gpointer)new;
+ size = ucl_object_toint (sizeo);
+ mf = rspamd_mmaped_file_open (cfg->cfg_pool, filename, size, stf);
+ mf->pool = cfg->cfg_pool;
+
+ return (gpointer)mf;
}
void
rspamd_mmaped_file_close (gpointer p)
{
- rspamd_mmaped_file_ctx *ctx = (rspamd_mmaped_file_ctx *)p;
- GHashTableIter it;
- gpointer k, v;
- rspamd_mmaped_file_t *mf;
+ rspamd_mmaped_file_t *mf = p;
- g_assert (ctx != NULL);
+ g_assert (p != NULL);
- rspamd_mempool_lock_mutex (ctx->lock);
- g_hash_table_iter_init (&it, ctx->files);
+ rspamd_mmaped_file_close_file (mf->pool, mf);
- while (g_hash_table_iter_next (&it, &k, &v)) {
- mf = v;
- rspamd_mmaped_file_close_file (ctx, mf);
- }
-
- g_hash_table_unref (ctx->files);
- rspamd_mempool_unlock_mutex (ctx->lock);
- /* XXX: we don't delete pool here to avoid deadlocks */
}
gpointer
@@ -943,109 +838,55 @@ rspamd_mmaped_file_runtime (struct rspamd_task *task,
gboolean learn,
gpointer p)
{
- rspamd_mmaped_file_ctx *ctx = (rspamd_mmaped_file_ctx *)p;
- rspamd_mmaped_file_t *mf;
- const ucl_object_t *filenameo, *sizeo;
- const gchar *filename;
- gsize size;
-
- g_assert (ctx != NULL);
-
- mf = rspamd_mmaped_file_is_open (ctx, stcf);
-
- if (mf == NULL) {
- /* Create file here */
-
- filenameo = ucl_object_find_key (stcf->opts, "filename");
- if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) {
- filenameo = ucl_object_find_key (stcf->opts, "path");
- if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) {
- msg_err_task ("statfile %s has no filename defined", stcf->symbol);
- return NULL;
- }
- }
-
- filename = ucl_object_tostring (filenameo);
-
- sizeo = ucl_object_find_key (stcf->opts, "size");
- if (sizeo == NULL || ucl_object_type (sizeo) != UCL_INT) {
- msg_err_task ("statfile %s has no size defined", stcf->symbol);
- return NULL;
- }
-
- size = ucl_object_toint (sizeo);
-
- if (learn) {
- rspamd_mmaped_file_create (ctx, filename, size, stcf, task->task_pool);
- }
-
- mf = rspamd_mmaped_file_open (ctx, filename, size, stcf);
- }
+ rspamd_mmaped_file_t *mf = p;
return (gpointer)mf;
}
gboolean
-rspamd_mmaped_file_process_token (struct rspamd_task *task, rspamd_token_t *tok,
- struct rspamd_token_result *res,
+rspamd_mmaped_file_process_tokens (struct rspamd_task *task, GPtrArray *tokens,
+ gint id,
gpointer p)
{
- rspamd_mmaped_file_ctx *ctx = (rspamd_mmaped_file_ctx *)p;
- rspamd_mmaped_file_t *mf;
+ rspamd_mmaped_file_t *mf = p;
guint32 h1, h2;
+ rspamd_token_t *tok;
+ guint i;
- g_assert (res != NULL);
+ g_assert (tokens != NULL);
g_assert (p != NULL);
- g_assert (res->st_runtime != NULL);
- g_assert (tok != NULL);
- g_assert (tok->datalen >= sizeof (guint32) * 2);
- mf = (rspamd_mmaped_file_t *)res->st_runtime->backend_runtime;
-
- if (mf == NULL) {
- /* Statfile is does not exist, so all values are zero */
- res->value = 0.0;
- return FALSE;
- }
-
- memcpy (&h1, tok->data, sizeof (h1));
- memcpy (&h2, tok->data + sizeof (h1), sizeof (h2));
- res->value = rspamd_mmaped_file_get_block (ctx, mf, h1, h2);
-
- if (res->value > 0.0) {
- return TRUE;
+ for (i = 0; i < tokens->len; i++) {
+ tok = g_ptr_array_index (tokens, i);
+ memcpy (&h1, tok->data, sizeof (h1));
+ memcpy (&h2, tok->data + sizeof (h1), sizeof (h2));
+ tok->values[id] = rspamd_mmaped_file_get_block (mf, h1, h2);
}
- return FALSE;
+ return TRUE;
}
gboolean
-rspamd_mmaped_file_learn_token (struct rspamd_task *task, rspamd_token_t *tok,
- struct rspamd_token_result *res,
+rspamd_mmaped_file_learn_tokens (struct rspamd_task *task, GPtrArray *tokens,
+ gint id,
gpointer p)
{
- rspamd_mmaped_file_ctx *ctx = (rspamd_mmaped_file_ctx *)p;
- rspamd_mmaped_file_t *mf;
+ rspamd_mmaped_file_t *mf = p;
guint32 h1, h2;
+ rspamd_token_t *tok;
+ guint i;
- g_assert (res != NULL);
+ g_assert (tokens != NULL);
g_assert (p != NULL);
- g_assert (res->st_runtime != NULL);
- g_assert (tok != NULL);
- g_assert (tok->datalen >= sizeof (guint32) * 2);
-
- mf = (rspamd_mmaped_file_t *)res->st_runtime->backend_runtime;
- if (mf == NULL) {
- /* Statfile is does not exist, so all values are zero */
- res->value = 0.0;
- return FALSE;
+ for (i = 0; i < tokens->len; i++) {
+ tok = g_ptr_array_index (tokens, i);
+ memcpy (&h1, tok->data, sizeof (h1));
+ memcpy (&h2, tok->data + sizeof (h1), sizeof (h2));
+ rspamd_mmaped_file_set_block (task->task_pool, mf, h1, h2,
+ tok->values[id]);
}
- memcpy (&h1, tok->data, sizeof (h1));
- memcpy (&h2, tok->data + sizeof (h1), sizeof (h2));
- rspamd_mmaped_file_set_block (task->task_pool, ctx, mf, h1, h2, res->value);
-
return TRUE;
}
diff --git a/src/libstat/backends/sqlite3_backend.c b/src/libstat/backends/sqlite3_backend.c
index 7b9cc8a42..3e05bee80 100644
--- a/src/libstat/backends/sqlite3_backend.c
+++ b/src/libstat/backends/sqlite3_backend.c
@@ -36,10 +36,11 @@
#define SQLITE3_DEFAULT "default"
struct rspamd_stat_sqlite3_db {
- struct rspamd_stat_sqlite3_ctx *ctx;
sqlite3 *sqlite;
gchar *fname;
GArray *prstmt;
+ lua_State *L;
+ rspamd_mempool_t *pool;
gboolean in_transaction;
gboolean enable_users;
gboolean enable_languages;
@@ -47,14 +48,7 @@ struct rspamd_stat_sqlite3_db {
gint cbref_language;
};
-struct rspamd_stat_sqlite3_ctx {
- GHashTable *files;
- rspamd_mempool_t *pool;
- lua_State *L;
-};
-
struct rspamd_stat_sqlite3_rt {
- struct rspamd_stat_sqlite3_ctx *ctx;
struct rspamd_task *task;
struct rspamd_stat_sqlite3_db *db;
struct rspamd_statfile_config *cf;
@@ -311,7 +305,7 @@ rspamd_sqlite3_get_user (struct rspamd_stat_sqlite3_db *db,
const gchar *user = NULL;
const InternetAddress *ia;
struct rspamd_task **ptask;
- lua_State *L = db->ctx->L;
+ lua_State *L = db->L;
GString *tb;
if (db->cbref_user == -1) {
@@ -391,7 +385,7 @@ rspamd_sqlite3_get_language (struct rspamd_stat_sqlite3_db *db,
const gchar *language = NULL;
struct mime_text_part *tp;
struct rspamd_task **ptask;
- lua_State *L = db->ctx->L;
+ lua_State *L = db->L;
GString *tb;
if (db->cbref_language == -1) {
@@ -471,6 +465,7 @@ rspamd_sqlite3_opendb (rspamd_mempool_t *pool,
bk = g_slice_alloc0 (sizeof (*bk));
bk->sqlite = rspamd_sqlite3_open_or_create (pool, path, create_tables_sql, err);
+ bk->pool = pool;
if (bk->sqlite == NULL) {
g_slice_free1 (sizeof (*bk), bk);
@@ -545,194 +540,146 @@ rspamd_sqlite3_opendb (rspamd_mempool_t *pool,
gpointer
rspamd_sqlite3_init (struct rspamd_stat_ctx *ctx,
- struct rspamd_config *cfg)
+ struct rspamd_config *cfg,
+ struct rspamd_statfile *st)
{
- struct rspamd_stat_sqlite3_ctx *new;
- struct rspamd_classifier_config *clf;
- struct rspamd_statfile_config *stf;
- GList *cur, *curst;
+ struct rspamd_classifier_config *clf = st->classifier->cfg;
+ struct rspamd_statfile_config *stf = st->stcf;
const ucl_object_t *filenameo, *lang_enabled, *users_enabled;
const gchar *filename, *lua_script;
struct rspamd_stat_sqlite3_db *bk;
GError *err = NULL;
- new = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (*new));
- new->files = g_hash_table_new (g_direct_hash, g_direct_equal);
- new->pool = cfg->cfg_pool;
- new->L = cfg->lua_state;
-
- /* Iterate over all classifiers and load matching statfiles */
- cur = cfg->classifiers;
-
- while (cur) {
- clf = cur->data;
- if (clf->backend && strcmp (clf->backend, SQLITE3_BACKEND_TYPE) == 0) {
- curst = clf->statfiles;
-
- while (curst) {
- stf = curst->data;
- /*
- * Check configuration sanity
- */
- filenameo = ucl_object_find_key (stf->opts, "filename");
- if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) {
- filenameo = ucl_object_find_key (stf->opts, "path");
- if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) {
- msg_err_config ("statfile %s has no filename defined", stf->symbol);
- curst = curst->next;
- continue;
- }
- }
+ filenameo = ucl_object_find_key (stf->opts, "filename");
+ if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) {
+ filenameo = ucl_object_find_key (stf->opts, "path");
+ if (filenameo == NULL || ucl_object_type (filenameo) != UCL_STRING) {
+ msg_err_config ("statfile %s has no filename defined", stf->symbol);
+ return NULL;
+ }
+ }
- filename = ucl_object_tostring (filenameo);
+ filename = ucl_object_tostring (filenameo);
- if ((bk = rspamd_sqlite3_opendb (cfg->cfg_pool, stf, filename,
- stf->opts, TRUE, &err)) == NULL) {
- msg_err_config ("cannot open sqlite3 db: %e", err);
- }
+ if ((bk = rspamd_sqlite3_opendb (cfg->cfg_pool, stf, filename,
+ stf->opts, TRUE, &err)) == NULL) {
+ msg_err_config ("cannot open sqlite3 db: %e", err);
+ g_error_free (err);
+ return NULL;
+ }
- if (bk != NULL) {
- bk->ctx = new;
- g_hash_table_insert (new->files, stf, bk);
- }
- else {
- g_error_free (err);
- err = NULL;
- curst = curst->next;
- continue;
- }
+ bk->L = cfg->lua_state;
- users_enabled = ucl_object_find_any_key (clf->opts, "per_user",
- "users_enabled", NULL);
- if (users_enabled != NULL) {
- if (ucl_object_type (users_enabled) == UCL_BOOLEAN) {
- bk->enable_users = ucl_object_toboolean (users_enabled);
- bk->cbref_user = -1;
- }
- else if (ucl_object_type (users_enabled) == UCL_STRING) {
- lua_script = ucl_object_tostring (users_enabled);
-
- if (luaL_dostring (new->L, lua_script) != 0) {
- msg_err_config ("cannot execute lua script for users "
- "extraction: %s", lua_tostring (new->L, -1));
- }
- else {
- if (lua_type (new->L, -1) == LUA_TFUNCTION) {
- bk->enable_users = TRUE;
- bk->cbref_user = luaL_ref (new->L,
- LUA_REGISTRYINDEX);
- }
- else {
- msg_err_config ("lua script must return "
- "function(task) and not %s",
- lua_typename (new->L, lua_type (new->L, -1)));
- }
- }
- }
- }
- else {
- bk->enable_users = FALSE;
- }
+ users_enabled = ucl_object_find_any_key (clf->opts, "per_user",
+ "users_enabled", NULL);
+ if (users_enabled != NULL) {
+ if (ucl_object_type (users_enabled) == UCL_BOOLEAN) {
+ bk->enable_users = ucl_object_toboolean (users_enabled);
+ bk->cbref_user = -1;
+ }
+ else if (ucl_object_type (users_enabled) == UCL_STRING) {
+ lua_script = ucl_object_tostring (users_enabled);
- lang_enabled = ucl_object_find_any_key (clf->opts,
- "per_language", "languages_enabled", NULL);
- if (lang_enabled != NULL) {
- if (ucl_object_type (lang_enabled) == UCL_BOOLEAN) {
- bk->enable_languages = ucl_object_toboolean (lang_enabled);
- bk->cbref_language = -1;
- }
- else if (ucl_object_type (lang_enabled) == UCL_STRING) {
- lua_script = ucl_object_tostring (lang_enabled);
-
- if (luaL_dostring (new->L, lua_script) != 0) {
- msg_err_config (
- "cannot execute lua script for languages "
- "extraction: %s",
- lua_tostring (new->L, -1));
- }
- else {
- if (lua_type (new->L, -1) == LUA_TFUNCTION) {
- bk->enable_languages = TRUE;
- bk->cbref_language = luaL_ref (new->L,
- LUA_REGISTRYINDEX);
- }
- else {
- msg_err_config ("lua script must return "
- "function(task) and not %s",
- lua_typename (new->L,
- lua_type (new->L, -1)));
- }
- }
- }
+ if (luaL_dostring (cfg->lua_state, lua_script) != 0) {
+ msg_err_config ("cannot execute lua script for users "
+ "extraction: %s", lua_tostring (cfg->lua_state, -1));
+ }
+ else {
+ if (lua_type (cfg->lua_state, -1) == LUA_TFUNCTION) {
+ bk->enable_users = TRUE;
+ bk->cbref_user = luaL_ref (cfg->lua_state,
+ LUA_REGISTRYINDEX);
}
else {
- bk->enable_languages = FALSE;
+ msg_err_config ("lua script must return "
+ "function(task) and not %s",
+ lua_typename (cfg->lua_state, lua_type (
+ cfg->lua_state, -1)));
}
+ }
+ }
+ }
+ else {
+ bk->enable_users = FALSE;
+ }
- if (bk->enable_languages) {
- msg_info_config ("enable per language statistics for %s",
- stf->symbol);
- }
+ lang_enabled = ucl_object_find_any_key (clf->opts,
+ "per_language", "languages_enabled", NULL);
- if (bk->enable_users) {
- msg_info_config ("enable per users statistics for %s",
- stf->symbol);
+ if (lang_enabled != NULL) {
+ if (ucl_object_type (lang_enabled) == UCL_BOOLEAN) {
+ bk->enable_languages = ucl_object_toboolean (lang_enabled);
+ bk->cbref_language = -1;
+ }
+ else if (ucl_object_type (lang_enabled) == UCL_STRING) {
+ lua_script = ucl_object_tostring (lang_enabled);
+
+ if (luaL_dostring (cfg->lua_state, lua_script) != 0) {
+ msg_err_config (
+ "cannot execute lua script for languages "
+ "extraction: %s",
+ lua_tostring (cfg->lua_state, -1));
+ }
+ else {
+ if (lua_type (cfg->lua_state, -1) == LUA_TFUNCTION) {
+ bk->enable_languages = TRUE;
+ bk->cbref_language = luaL_ref (cfg->lua_state,
+ LUA_REGISTRYINDEX);
+ }
+ else {
+ msg_err_config ("lua script must return "
+ "function(task) and not %s",
+ lua_typename (cfg->lua_state,
+ lua_type (cfg->lua_state, -1)));
}
-
- ctx->statfiles ++;
-
- curst = curst->next;
}
}
+ }
+ else {
+ bk->enable_languages = FALSE;
+ }
- cur = g_list_next (cur);
+ if (bk->enable_languages) {
+ msg_info_config ("enable per language statistics for %s",
+ stf->symbol);
}
- return (gpointer)new;
+ if (bk->enable_users) {
+ msg_info_config ("enable per users statistics for %s",
+ stf->symbol);
+ }
+
+
+ return (gpointer) bk;
}
void
rspamd_sqlite3_close (gpointer p)
{
- struct rspamd_stat_sqlite3_ctx *ctx = p;
- struct rspamd_stat_sqlite3_db *bk;
- GHashTableIter it;
- gpointer k, v;
-
- g_hash_table_iter_init (&it, ctx->files);
-
- while (g_hash_table_iter_next (&it, &k, &v)) {
- bk = v;
+ struct rspamd_stat_sqlite3_db *bk = p;
- if (bk->sqlite) {
- if (bk->in_transaction) {
- rspamd_sqlite3_run_prstmt (ctx->pool, bk->sqlite, bk->prstmt,
- RSPAMD_STAT_BACKEND_TRANSACTION_COMMIT);
- }
-
- rspamd_sqlite3_close_prstmt (bk->sqlite, bk->prstmt);
- sqlite3_close (bk->sqlite);
- g_free (bk->fname);
- g_slice_free1 (sizeof (*bk), bk);
+ if (bk->sqlite) {
+ if (bk->in_transaction) {
+ rspamd_sqlite3_run_prstmt (bk->pool, bk->sqlite, bk->prstmt,
+ RSPAMD_STAT_BACKEND_TRANSACTION_COMMIT);
}
- }
- g_hash_table_destroy (ctx->files);
+ rspamd_sqlite3_close_prstmt (bk->sqlite, bk->prstmt);
+ sqlite3_close (bk->sqlite);
+ g_free (bk->fname);
+ g_slice_free1 (sizeof (*bk), bk);
+ }
}
gpointer
rspamd_sqlite3_runtime (struct rspamd_task *task,
struct rspamd_statfile_config *stcf, gboolean learn, gpointer p)
{
- struct rspamd_stat_sqlite3_ctx *ctx = p;
struct rspamd_stat_sqlite3_rt *rt = NULL;
- struct rspamd_stat_sqlite3_db *bk;
-
- bk = g_hash_table_lookup (ctx->files, stcf);
+ struct rspamd_stat_sqlite3_db *bk = p;
if (bk) {
rt = rspamd_mempool_alloc (task->task_pool, sizeof (*rt));
- rt->ctx = ctx;
rt->db = bk;
rt->task = task;
rt->user_id = -1;
@@ -744,66 +691,64 @@ rspamd_sqlite3_runtime (struct rspamd_task *task,
}
gboolean
-rspamd_sqlite3_process_token (struct rspamd_task *task, struct token_node_s *tok,
- struct rspamd_token_result *res, gpointer p)
+rspamd_sqlite3_process_tokens (struct rspamd_task *task,
+ GPtrArray *tokens,
+ gint id, gpointer p)
{
struct rspamd_stat_sqlite3_db *bk;
- struct rspamd_stat_sqlite3_rt *rt;
+ struct rspamd_stat_sqlite3_rt *rt = p;
gint64 iv = 0, idx;
+ guint i;
+ rspamd_token_t *tok;
- g_assert (res != NULL);
g_assert (p != NULL);
- g_assert (res->st_runtime != NULL);
- g_assert (tok != NULL);
- g_assert (tok->datalen >= sizeof (guint32) * 2);
+ g_assert (tokens != NULL);
- rt = res->st_runtime->backend_runtime;
bk = rt->db;
- if (bk == NULL) {
- /* Statfile is does not exist, so all values are zero */
- res->value = 0.0;
- return FALSE;
- }
-
- if (!bk->in_transaction) {
- rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt,
- RSPAMD_STAT_BACKEND_TRANSACTION_START_DEF);
- bk->in_transaction = TRUE;
- }
+ for (i = 0; i < tokens->len; i ++) {
+ tok = g_ptr_array_index (tokens, i);
- if (rt->user_id == -1) {
- if (bk->enable_users) {
- rt->user_id = rspamd_sqlite3_get_user (bk, task, FALSE);
- }
- else {
- rt->user_id = 0;
+ if (bk == NULL) {
+ /* Statfile is does not exist, so all values are zero */
+ tok->values[id] = 0.0;
+ continue;
}
- }
- if (rt->lang_id == -1) {
- if (bk->enable_languages) {
- rt->lang_id = rspamd_sqlite3_get_language (bk, task, FALSE);
+ if (!bk->in_transaction) {
+ rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt,
+ RSPAMD_STAT_BACKEND_TRANSACTION_START_DEF);
+ bk->in_transaction = TRUE;
}
- else {
- rt->lang_id = 0;
+
+ if (rt->user_id == -1) {
+ if (bk->enable_users) {
+ rt->user_id = rspamd_sqlite3_get_user (bk, task, FALSE);
+ }
+ else {
+ rt->user_id = 0;
+ }
}
- }
- memcpy (&idx, tok->data, sizeof (idx));
+ if (rt->lang_id == -1) {
+ if (bk->enable_languages) {
+ rt->lang_id = rspamd_sqlite3_get_language (bk, task, FALSE);
+ }
+ else {
+ rt->lang_id = 0;
+ }
+ }
- if (rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt,
- RSPAMD_STAT_BACKEND_GET_TOKEN,
- idx, rt->user_id, rt->lang_id, &iv) == SQLITE_OK) {
- res->value = iv;
+ memcpy (&idx, tok->data, sizeof (idx));
- if (iv == 0) {
- return FALSE;
+ if (rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt,
+ RSPAMD_STAT_BACKEND_GET_TOKEN,
+ idx, rt->user_id, rt->lang_id, &iv) == SQLITE_OK) {
+ tok->values[id] = iv;
+ }
+ else {
+ tok->values[id] = 0.0;
}
- }
- else {
- res->value = 0.0;
- return FALSE;
}
@@ -833,58 +778,63 @@ rspamd_sqlite3_finalize_process (struct rspamd_task *task, gpointer runtime,
}
gboolean
-rspamd_sqlite3_learn_token (struct rspamd_task *task, struct token_node_s *tok,
- struct rspamd_token_result *res, gpointer p)
+rspamd_sqlite3_learn_tokens (struct rspamd_task *task, GPtrArray *tokens,
+ gint id, gpointer p)
{
struct rspamd_stat_sqlite3_db *bk;
- struct rspamd_stat_sqlite3_rt *rt;
+ struct rspamd_stat_sqlite3_rt *rt = p;
gint64 iv = 0, idx;
+ guint i;
+ rspamd_token_t *tok;
- g_assert (res != NULL);
+ g_assert (tokens != NULL);
g_assert (p != NULL);
- g_assert (res->st_runtime != NULL);
- g_assert (tok != NULL);
- g_assert (tok->datalen >= sizeof (guint32) * 2);
- rt = res->st_runtime->backend_runtime;
bk = rt->db;
- if (bk == NULL) {
- /* Statfile is does not exist, so all values are zero */
- return FALSE;
- }
-
- if (!bk->in_transaction) {
- rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt,
- RSPAMD_STAT_BACKEND_TRANSACTION_START_IM);
- bk->in_transaction = TRUE;
- }
-
- if (rt->user_id == -1) {
- if (bk->enable_users) {
- rt->user_id = rspamd_sqlite3_get_user (bk, task, TRUE);
+ for (i = 0; i < tokens->len; i++) {
+ tok = g_ptr_array_index (tokens, i);
+ if (bk == NULL) {
+ /* Statfile is does not exist, so all values are zero */
+ return FALSE;
}
- else {
- rt->user_id = 0;
+
+ if (!bk->in_transaction) {
+ rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt,
+ RSPAMD_STAT_BACKEND_TRANSACTION_START_IM);
+ bk->in_transaction = TRUE;
}
- }
- if (rt->lang_id == -1) {
- if (bk->enable_languages) {
- rt->lang_id = rspamd_sqlite3_get_language (bk, task, TRUE);
+ if (rt->user_id == -1) {
+ if (bk->enable_users) {
+ rt->user_id = rspamd_sqlite3_get_user (bk, task, TRUE);
+ }
+ else {
+ rt->user_id = 0;
+ }
}
- else {
- rt->lang_id = 0;
+
+ if (rt->lang_id == -1) {
+ if (bk->enable_languages) {
+ rt->lang_id = rspamd_sqlite3_get_language (bk, task, TRUE);
+ }
+ else {
+ rt->lang_id = 0;
+ }
}
- }
- iv = res->value;
- memcpy (&idx, tok->data, sizeof (idx));
+ iv = tok->values[id];
+ memcpy (&idx, tok->data, sizeof (idx));
+
+ if (rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt,
+ RSPAMD_STAT_BACKEND_SET_TOKEN,
+ idx, rt->user_id, rt->lang_id, iv) != SQLITE_OK) {
+ rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt,
+ RSPAMD_STAT_BACKEND_TRANSACTION_ROLLBACK);
+ bk->in_transaction = FALSE;
- if (rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt,
- RSPAMD_STAT_BACKEND_SET_TOKEN,
- idx, rt->user_id, rt->lang_id, iv) != SQLITE_OK) {
- return FALSE;
+ return FALSE;
+ }
}
return TRUE;
@@ -1024,7 +974,7 @@ rspamd_sqlite3_get_stat (gpointer runtime,
g_assert (rt != NULL);
bk = rt->db;
- pool = rt->ctx->pool;
+ pool = bk->pool;
(void)stat (bk->fname, &st);
rspamd_sqlite3_run_prstmt (pool, bk->sqlite, bk->prstmt,
@@ -1072,7 +1022,7 @@ rspamd_sqlite3_load_tokenizer_config (gpointer runtime,
g_assert (rt != NULL);
bk = rt->db;
- g_assert (rspamd_sqlite3_run_prstmt (rt->ctx->pool, bk->sqlite, bk->prstmt,
+ g_assert (rspamd_sqlite3_run_prstmt (rt->db->pool, bk->sqlite, bk->prstmt,
RSPAMD_STAT_BACKEND_LOAD_TOKENIZER, &sz, &tk_conf) == SQLITE_OK);
g_assert (sz > 0);
/*
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index 180aa4658..0915933f1 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -90,7 +90,10 @@ inv_chi_square (struct rspamd_task *task, gdouble value, gint freedom_deg)
}
struct bayes_task_closure {
- struct rspamd_classifier_runtime *rt;
+ double ham_prob;
+ double spam_prob;
+ guint64 processed_tokens;
+ guint64 total_hits;
struct rspamd_task *task;
};
@@ -104,44 +107,46 @@ static const double feature_weight[] = { 0, 1, 4, 27, 256, 3125, 46656, 823543 }
/*
* In this callback we calculate local probabilities for tokens
*/
-static gboolean
-bayes_classify_callback (gpointer key, gpointer value, gpointer data)
+static void
+bayes_classify_token (struct rspamd_classifier *ctx,
+ rspamd_token_t *tok, struct bayes_task_closure *cl)
{
- rspamd_token_t *node = value;
- struct bayes_task_closure *cl = data;
- struct rspamd_classifier_runtime *rt;
guint i;
- struct rspamd_token_result *res;
+ gint id;
guint64 spam_count = 0, ham_count = 0, total_count = 0;
+ struct rspamd_statfile *st;
struct rspamd_task *task;
double spam_prob, spam_freq, ham_freq, bayes_spam_prob, bayes_ham_prob,
- ham_prob, fw, w, norm_sum, norm_sub;
+ ham_prob, fw, w, norm_sum, norm_sub, val;
- rt = cl->rt;
task = cl->task;
- for (i = rt->start_pos; i < rt->end_pos; i++) {
- res = &g_array_index (node->results, struct rspamd_token_result, i);
+ for (i = 0; i < ctx->statfiles_ids->len; i++) {
+ id = g_array_index (ctx->statfiles_ids, gint, i);
+ st = g_ptr_array_index (ctx->ctx->statfiles, id);
+ g_assert (st != NULL);
+ val = tok->values[id];
- if (res->value > 0) {
- if (res->st_runtime->st->is_spam) {
- spam_count += res->value;
+ if (val > 0) {
+ if (st->stcf->is_spam) {
+ spam_count += val;
}
else {
- ham_count += res->value;
+ ham_count += val;
}
- total_count += res->value;
- res->st_runtime->total_hits += res->value;
+
+ total_count += val;
+ cl->total_hits += val;
}
}
/* Probability for this token */
if (total_count > 0) {
- spam_freq = ((double)spam_count / MAX (1., (double)rt->total_spam));
- ham_freq = ((double)ham_count / MAX (1., (double)rt->total_ham));
+ spam_freq = ((double)spam_count / MAX (1., (double) ctx->spam_learns));
+ ham_freq = ((double)ham_count / MAX (1., (double)ctx->ham_learns));
spam_prob = spam_freq / (spam_freq + ham_freq);
ham_prob = ham_freq / (spam_freq + ham_freq);
- fw = feature_weight[node->window_idx % G_N_ELEMENTS (feature_weight)];
+ fw = feature_weight[tok->window_idx % G_N_ELEMENTS (feature_weight)];
norm_sum = (spam_freq + ham_freq) * (spam_freq + ham_freq);
norm_sub = (spam_freq - ham_freq) * (spam_freq - ham_freq);
w = (norm_sub) / (norm_sum) *
@@ -151,9 +156,9 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)
w = (norm_sub) / (norm_sum) *
(fw * total_count) / (4.0 * (1.0 + fw * total_count));
bayes_ham_prob = PROB_COMBINE (ham_prob, total_count, w, 0.5);
- rt->spam_prob += log (bayes_spam_prob);
- rt->ham_prob += log (bayes_ham_prob);
- res->cl_runtime->processed_tokens ++;
+ cl->spam_prob += log (bayes_spam_prob);
+ cl->ham_prob += log (bayes_ham_prob);
+ cl->processed_tokens ++;
msg_debug_bayes ("token: weight: %f, total_count: %L, "
"spam_count: %L, ham_count: %L,"
@@ -163,10 +168,8 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)
fw, total_count, spam_count, ham_count,
spam_prob, ham_prob,
bayes_spam_prob, bayes_ham_prob,
- rt->spam_prob, rt->ham_prob);
+ cl->spam_prob, cl->ham_prob);
}
-
- return FALSE;
}
/*
@@ -191,191 +194,153 @@ bayes_normalize_prob (gdouble x)
return a*x4 + b*x3 + c*x2 + d*xx;
}
-struct classifier_ctx *
-bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cfg)
+void
+bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier *cl)
{
- struct classifier_ctx *ctx =
- rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx));
-
- ctx->pool = pool;
- ctx->cfg = cfg;
- ctx->debug = FALSE;
-
- return ctx;
}
gboolean
-bayes_classify (struct classifier_ctx * ctx,
- GTree *input,
- struct rspamd_classifier_runtime *rt,
- struct rspamd_task *task)
+bayes_classify (struct rspamd_classifier * ctx,
+ GPtrArray *tokens,
+ struct rspamd_task *task)
{
double final_prob, h, s;
- guint maxhits = 0;
- struct rspamd_statfile_runtime *st, *selected_st = NULL;
- GList *cur;
char *sumbuf;
+ struct rspamd_statfile *st = NULL;
struct bayes_task_closure cl;
+ rspamd_token_t *tok;
+ guint i;
+ gint id;
+ GList *cur;
g_assert (ctx != NULL);
- g_assert (input != NULL);
- g_assert (rt != NULL);
- g_assert (rt->end_pos > rt->start_pos);
-
- if (rt->stage == RSPAMD_STAT_STAGE_PRE) {
- cl.rt = rt;
- cl.task = task;
- g_tree_foreach (input, bayes_classify_callback, &cl);
+ g_assert (tokens != NULL);
+
+ memset (&cl, 0, sizeof (cl));
+ cl.task = task;
+
+ for (i = 0; i < tokens->len; i ++) {
+ tok = g_ptr_array_index (tokens, i);
+
+ bayes_classify_token (ctx, tok, &cl);
+ }
+
+ h = 1 - inv_chi_square (task, cl.spam_prob, cl.processed_tokens);
+ s = 1 - inv_chi_square (task, cl.ham_prob, cl.processed_tokens);
+
+ if (isfinite (s) && isfinite (h)) {
+ final_prob = (s + 1.0 - h) / 2.;
+ msg_debug_bayes (
+ "<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f,"
+ " %L tokens processed of %ud total tokens",
+ task->message_id,
+ cl.ham_prob,
+ h,
+ cl.spam_prob,
+ s,
+ cl.processed_tokens,
+ tokens->len);
}
else {
- h = 1 - inv_chi_square (task, rt->spam_prob, rt->processed_tokens);
- s = 1 - inv_chi_square (task, rt->ham_prob, rt->processed_tokens);
-
- if (isfinite (s) && isfinite (h)) {
- final_prob = (s + 1.0 - h) / 2.;
- msg_debug_bayes ("<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f,"
- " %L tokens processed of %ud total tokens",
- task->message_id, rt->ham_prob, h, rt->spam_prob, s,
- rt->processed_tokens, g_tree_nnodes (input));
+ /*
+ * We have some overflow, hence we need to check which class
+ * is NaN
+ */
+ if (isfinite (h)) {
+ final_prob = 1.0;
+ msg_debug_bayes ("<%s> spam class is overflowed, as we have no"
+ " ham samples", task->message_id);
+ }
+ else if (isfinite (s)) {
+ final_prob = 0.0;
+ msg_debug_bayes ("<%s> ham class is overflowed, as we have no"
+ " spam samples", task->message_id);
}
else {
- /*
- * We have some overflow, hence we need to check which class
- * is NaN
- */
- if (isfinite (h)) {
- final_prob = 1.0;
- msg_debug_bayes ("<%s> spam class is overflowed, as we have no"
- " ham samples", task->message_id);
- }
- else if (isfinite (s)){
- final_prob = 0.0;
- msg_debug_bayes ("<%s> ham class is overflowed, as we have no"
- " spam samples", task->message_id);
- }
- else {
- final_prob = 0.5;
- msg_warn_bayes ("<%s> spam and ham classes are both overflowed",
- task->message_id);
- }
+ final_prob = 0.5;
+ msg_warn_bayes ("<%s> spam and ham classes are both overflowed",
+ task->message_id);
}
+ }
- if (rt->processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) {
+ if (cl.processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) {
- sumbuf = rspamd_mempool_alloc (task->task_pool, 32);
- cur = g_list_first (rt->st_runtime);
+ sumbuf = rspamd_mempool_alloc (task->task_pool, 32);
- while (cur) {
- st = (struct rspamd_statfile_runtime *)cur->data;
+ /* Now we can have exactly one HAM and exactly one SPAM statfiles per classifier */
+ for (i = 0; i < ctx->statfiles_ids->len; i++) {
+ id = g_array_index (ctx->statfiles_ids, gint, i);
+ st = g_ptr_array_index (ctx->ctx->statfiles, id);
- if ((final_prob < 0.5 && !st->st->is_spam) ||
- (final_prob > 0.5 && st->st->is_spam)) {
- if (st->total_hits > maxhits) {
- maxhits = st->total_hits;
- selected_st = st;
- }
- }
-
- cur = g_list_next (cur);
+ if (final_prob > 0.5 && st->stcf->is_spam) {
+ break;
}
-
- if (selected_st == NULL) {
- msg_err_bayes (
- "unexpected classifier error: cannot select desired statfile, "
- "prob: %.4f", final_prob);
+ else if (final_prob < 0.5 && !st->stcf->is_spam) {
+ break;
}
- else {
- /* Correctly scale HAM */
- if (final_prob < 0.5) {
- final_prob = 1.0 - final_prob;
- }
-
- rspamd_snprintf (sumbuf, 32, "%.2f%%", final_prob * 100.);
- final_prob = bayes_normalize_prob (final_prob);
+ }
- cur = g_list_prepend (NULL, sumbuf);
- rspamd_task_insert_result (task,
- selected_st->st->symbol,
- final_prob,
- cur);
- }
+ /* Correctly scale HAM */
+ if (final_prob < 0.5) {
+ final_prob = 1.0 - final_prob;
}
+
+ rspamd_snprintf (sumbuf, 32, "%.2f%%", final_prob * 100.);
+ final_prob = bayes_normalize_prob (final_prob);
+ g_assert (st != NULL);
+ cur = g_list_prepend (NULL, sumbuf);
+ rspamd_task_insert_result (task,
+ st->stcf->symbol,
+ final_prob,
+ cur);
}
return TRUE;
}
-static gboolean
-bayes_learn_spam_callback (gpointer key, gpointer value, gpointer data)
+gboolean
+bayes_learn_spam (struct rspamd_classifier * ctx,
+ GPtrArray *tokens,
+ struct rspamd_task *task,
+ gboolean is_spam,
+ GError **err)
{
- rspamd_token_t *node = value;
- struct rspamd_token_result *res;
- struct rspamd_classifier_runtime *rt = (struct rspamd_classifier_runtime *)data;
- guint i;
+ guint i, j;
+ gint id;
+ struct rspamd_statfile *st;
+ rspamd_token_t *tok;
+ g_assert (ctx != NULL);
+ g_assert (tokens != NULL);
- for (i = rt->start_pos; i < rt->end_pos; i++) {
- res = &g_array_index (node->results, struct rspamd_token_result, i);
-
- if (res->st_runtime) {
- if (res->st_runtime->st->is_spam) {
- res->value ++;
- }
- else if (res->value > 0) {
- /* Unlearning */
- res->value --;
- }
- }
- }
-
- return FALSE;
-}
-
-static gboolean
-bayes_learn_ham_callback (gpointer key, gpointer value, gpointer data)
-{
- rspamd_token_t *node = value;
- struct rspamd_token_result *res;
- struct rspamd_classifier_runtime *rt = (struct rspamd_classifier_runtime *)data;
- guint i;
-
+ for (i = 0; i < tokens->len; i++) {
+ tok = g_ptr_array_index (tokens, i);
- for (i = rt->start_pos; i < rt->end_pos; i++) {
- res = &g_array_index (node->results, struct rspamd_token_result, i);
+ for (j = 0; j < ctx->statfiles_ids->len; j++) {
+ id = g_array_index (ctx->statfiles_ids, gint, j);
+ st = g_ptr_array_index (ctx->ctx->statfiles, id);
+ g_assert (st != NULL);
- if (res->st_runtime) {
- if (!res->st_runtime->st->is_spam) {
- res->value ++;
+ if (is_spam) {
+ if (st->stcf->is_spam) {
+ tok->values[id]++;
+ }
+ else if (tok->values[id] > 0) {
+ /* Unlearning */
+ tok->values[id]--;
+ }
}
- else if (res->value > 0) {
- res->value --;
+ else {
+ if (!st->stcf->is_spam) {
+ tok->values[id]++;
+ }
+ else if (tok->values[id] > 0) {
+ /* Unlearning */
+ tok->values[id]--;
+ }
}
}
}
- return FALSE;
-}
-
-gboolean
-bayes_learn_spam (struct classifier_ctx * ctx,
- GTree *input,
- struct rspamd_classifier_runtime *rt,
- struct rspamd_task *task,
- gboolean is_spam,
- GError **err)
-{
- g_assert (ctx != NULL);
- g_assert (input != NULL);
- g_assert (rt != NULL);
- g_assert (rt->end_pos > rt->start_pos);
-
- if (is_spam) {
- g_tree_foreach (input, bayes_learn_spam_callback, rt);
- }
- else {
- g_tree_foreach (input, bayes_learn_ham_callback, rt);
- }
-
-
return TRUE;
}
diff --git a/src/libstat/classifiers/classifiers.h b/src/libstat/classifiers/classifiers.h
index 9a30039df..86395c96d 100644
--- a/src/libstat/classifiers/classifiers.h
+++ b/src/libstat/classifiers/classifiers.h
@@ -4,49 +4,40 @@
#include "config.h"
#include "mem_pool.h"
+#define RSPAMD_DEFAULT_CLASSIFIER "bayes"
/* Consider this value as 0 */
#define ALPHA 0.0001
struct rspamd_classifier_config;
struct rspamd_task;
-
-/* Common classifier structure */
-struct classifier_ctx {
- rspamd_mempool_t *pool;
- GHashTable *results;
- gboolean debug;
- struct rspamd_classifier_config *cfg;
-};
+struct rspamd_classifier;
struct token_node_s;
-struct rspamd_classifier_runtime;
struct rspamd_stat_classifier {
char *name;
- struct classifier_ctx * (*init_func)(rspamd_mempool_t *pool,
- struct rspamd_classifier_config *cf);
- gboolean (*classify_func)(struct classifier_ctx * ctx,
- GTree *input, struct rspamd_classifier_runtime *rt,
- struct rspamd_task *task);
- gboolean (*learn_spam_func)(struct classifier_ctx * ctx,
- GTree *input, struct rspamd_classifier_runtime *rt,
- struct rspamd_task *task, gboolean is_spam,
- GError **err);
+ void (*init_func)(rspamd_mempool_t *pool,
+ struct rspamd_classifier *cl);
+ gboolean (*classify_func)(struct rspamd_classifier * ctx,
+ GPtrArray *tokens,
+ struct rspamd_task *task);
+ gboolean (*learn_spam_func)(struct rspamd_classifier * ctx,
+ GPtrArray *input,
+ struct rspamd_task *task, gboolean is_spam,
+ GError **err);
};
/* Bayes algorithm */
-struct classifier_ctx * bayes_init (rspamd_mempool_t *pool,
- struct rspamd_classifier_config *cf);
-gboolean bayes_classify (struct classifier_ctx * ctx,
- GTree *input,
- struct rspamd_classifier_runtime *rt,
- struct rspamd_task *task);
-gboolean bayes_learn_spam (struct classifier_ctx * ctx,
- GTree *input,
- struct rspamd_classifier_runtime *rt,
- struct rspamd_task *task,
- gboolean is_spam,
- GError **err);
+void bayes_init (rspamd_mempool_t *pool,
+ struct rspamd_classifier *);
+gboolean bayes_classify (struct rspamd_classifier *ctx,
+ GPtrArray *tokens,
+ struct rspamd_task *task);
+gboolean bayes_learn_spam (struct rspamd_classifier *ctx,
+ GPtrArray *tokens,
+ struct rspamd_task *task,
+ gboolean is_spam,
+ GError **err);
#endif
/*
diff --git a/src/libstat/learn_cache/learn_cache.h b/src/libstat/learn_cache/learn_cache.h
index 273e906f5..84851649f 100644
--- a/src/libstat/learn_cache/learn_cache.h
+++ b/src/libstat/learn_cache/learn_cache.h
@@ -36,7 +36,8 @@ struct rspamd_config;
struct rspamd_stat_cache {
const char *name;
- gpointer (*init)(struct rspamd_stat_ctx *ctx, struct rspamd_config *cfg);
+ gpointer (*init)(struct rspamd_stat_ctx *ctx,
+ struct rspamd_config *cfg, const ucl_object_t *cf);
gint (*process)(struct rspamd_task *task,
gboolean is_spam,
gpointer ctx);
@@ -45,7 +46,8 @@ struct rspamd_stat_cache {
};
gpointer rspamd_stat_cache_sqlite3_init(struct rspamd_stat_ctx *ctx,
- struct rspamd_config *cfg);
+ struct rspamd_config *cfg,
+ const ucl_object_t *cf);
gint rspamd_stat_cache_sqlite3_process (
struct rspamd_task *task,
gboolean is_spam, gpointer c);
diff --git a/src/libstat/learn_cache/sqlite3_cache.c b/src/libstat/learn_cache/sqlite3_cache.c
index cf4ab615a..9590d00b5 100644
--- a/src/libstat/learn_cache/sqlite3_cache.c
+++ b/src/libstat/learn_cache/sqlite3_cache.c
@@ -121,71 +121,48 @@ struct rspamd_stat_sqlite3_ctx {
gpointer
rspamd_stat_cache_sqlite3_init(struct rspamd_stat_ctx *ctx,
- struct rspamd_config *cfg)
+ struct rspamd_config *cfg,
+ const ucl_object_t *cf)
{
struct rspamd_stat_sqlite3_ctx *new = NULL;
- struct rspamd_classifier_config *clf;
- const ucl_object_t *obj, *elt;
- GList *cur;
+ const ucl_object_t *elt;
gchar dbpath[PATH_MAX];
+ const gchar *path = SQLITE_CACHE_PATH;
sqlite3 *sqlite;
- gboolean has_sqlite_cache = FALSE;
GError *err = NULL;
- rspamd_snprintf (dbpath, sizeof (dbpath), SQLITE_CACHE_PATH);
- cur = cfg->classifiers;
- while (cur) {
- clf = cur->data;
+ if (cf) {
+ elt = ucl_object_find_key (cf, "path");
- obj = ucl_object_find_key (clf->opts, "cache");
-
- /* Sqlite3 cache is the default learn cache method */
- if (obj == NULL) {
- has_sqlite_cache = TRUE;
- break;
+ if (elt != NULL) {
+ path = ucl_object_tostring (elt);
}
- else if (ucl_object_type (obj) == UCL_OBJECT) {
- elt = ucl_object_find_key (obj, "name");
+ }
- if (ucl_object_type (elt) == UCL_STRING &&
- g_ascii_strcasecmp (ucl_object_tostring (elt), "sqlite3") == 0) {
+ rspamd_snprintf (dbpath, sizeof (dbpath), "%s", path);
- has_sqlite_cache = TRUE;
- elt = ucl_object_find_key (obj, "path");
- if (elt != NULL && ucl_object_type (elt) == UCL_STRING) {
- rspamd_snprintf (dbpath, sizeof (dbpath), "%s",
- ucl_object_tostring (elt));
- }
- }
- }
+ sqlite = rspamd_sqlite3_open_or_create (cfg->cfg_pool,
+ dbpath, create_tables_sql, &err);
- cur = g_list_next (cur);
+ if (sqlite == NULL) {
+ msg_err ("cannot open sqlite3 cache: %e", err);
+ g_error_free (err);
+ err = NULL;
}
+ else {
+ new = g_slice_alloc (sizeof (*new));
+ new->db = sqlite;
+ new->prstmt = rspamd_sqlite3_init_prstmt (sqlite, prepared_stmts,
+ RSPAMD_STAT_CACHE_MAX, &err);
- if (has_sqlite_cache) {
- sqlite = rspamd_sqlite3_open_or_create (cfg->cfg_pool,
- dbpath, create_tables_sql, &err);
-
- if (sqlite == NULL) {
- msg_err_config ("cannot open sqlite3 cache: %e", err);
+ if (new->prstmt == NULL) {
+ msg_err ("cannot open sqlite3 cache: %e", err);
g_error_free (err);
err = NULL;
- }
- else {
- new = g_slice_alloc (sizeof (*new));
- new->db = sqlite;
- new->prstmt = rspamd_sqlite3_init_prstmt (sqlite, prepared_stmts,
- RSPAMD_STAT_CACHE_MAX, &err);
-
- if (new->prstmt == NULL) {
- msg_err_config ("cannot open sqlite3 cache: %e", err);
- g_error_free (err);
- err = NULL;
- sqlite3_close (sqlite);
- g_slice_free1 (sizeof (*new), new);
- new = NULL;
- }
+ sqlite3_close (sqlite);
+ g_slice_free1 (sizeof (*new), new);
+ new = NULL;
}
}
diff --git a/src/libstat/stat_config.c b/src/libstat/stat_config.c
index dbfe16c27..1f16a98de 100644
--- a/src/libstat/stat_config.c
+++ b/src/libstat/stat_config.c
@@ -44,18 +44,12 @@ static struct rspamd_stat_tokenizer stat_tokenizers[] = {
{
.name = "osb-text",
.get_config = rspamd_tokenizer_osb_get_config,
- .compatible_config = rspamd_tokenizer_osb_compatible_config,
.tokenize_func = rspamd_tokenizer_osb,
- .load_config = rspamd_tokenizer_osb_load_config,
- .is_compat = rspamd_tokenizer_osb_is_compat
},
{
.name = "osb",
.get_config = rspamd_tokenizer_osb_get_config,
- .compatible_config = rspamd_tokenizer_osb_compatible_config,
.tokenize_func = rspamd_tokenizer_osb,
- .load_config = rspamd_tokenizer_osb_load_config,
- .is_compat = rspamd_tokenizer_osb_is_compat
},
};
@@ -63,9 +57,9 @@ static struct rspamd_stat_tokenizer stat_tokenizers[] = {
.name = #nam, \
.init = rspamd_##eltn##_init, \
.runtime = rspamd_##eltn##_runtime, \
- .process_token = rspamd_##eltn##_process_token, \
+ .process_tokens = rspamd_##eltn##_process_tokens, \
.finalize_process = rspamd_##eltn##_finalize_process, \
- .learn_token = rspamd_##eltn##_learn_token, \
+ .learn_tokens = rspamd_##eltn##_learn_tokens, \
.finalize_learn = rspamd_##eltn##_finalize_learn, \
.total_learns = rspamd_##eltn##_total_learns, \
.inc_learns = rspamd_##eltn##_inc_learns, \
@@ -92,32 +86,106 @@ static struct rspamd_stat_cache stat_caches[] = {
void
rspamd_stat_init (struct rspamd_config *cfg)
{
- guint i;
+ GList *cur, *curst;
+ struct rspamd_classifier_config *clf;
+ struct rspamd_statfile_config *stf;
+ struct rspamd_stat_backend *bk;
+ struct rspamd_statfile *st;
+ struct rspamd_classifier *cl;
+ const ucl_object_t *cache_obj = NULL, *cache_name_obj;
+ const gchar *cache_name = NULL;
if (stat_ctx == NULL) {
stat_ctx = g_slice_alloc0 (sizeof (*stat_ctx));
}
- stat_ctx->backends = stat_backends;
+ stat_ctx->backends_subrs = stat_backends;
stat_ctx->backends_count = G_N_ELEMENTS (stat_backends);
- stat_ctx->classifiers = stat_classifiers;
+ stat_ctx->classifiers_subrs = stat_classifiers;
stat_ctx->classifiers_count = G_N_ELEMENTS (stat_classifiers);
- stat_ctx->tokenizers = stat_tokenizers;
+ stat_ctx->tokenizers_subrs = stat_tokenizers;
stat_ctx->tokenizers_count = G_N_ELEMENTS (stat_tokenizers);
- stat_ctx->caches = stat_caches;
+ stat_ctx->caches_subrs = stat_caches;
stat_ctx->caches_count = G_N_ELEMENTS (stat_caches);
stat_ctx->cfg = cfg;
+ stat_ctx->statfiles = g_ptr_array_new ();
+ stat_ctx->classifiers = g_ptr_array_new ();
+ REF_RETAIN (stat_ctx->cfg);
- /* Init backends */
- for (i = 0; i < stat_ctx->backends_count; i ++) {
- stat_ctx->backends[i].ctx = stat_ctx->backends[i].init (stat_ctx, cfg);
- msg_debug_config ("added backend %s", stat_ctx->backends[i].name);
- }
+ /* Create statfiles from the classifiers */
+ cur = cfg->classifiers;
+
+ while (cur) {
+ clf = cur->data;
+ bk = rspamd_stat_get_backend (clf->backend);
+ g_assert (bk != NULL);
+
+ /* XXX:
+ * Here we get the first classifier tokenizer config as the only one
+ * We NO LONGER support multiple tokenizers per rspamd instance
+ */
+ if (stat_ctx->tkcf == NULL) {
+ stat_ctx->tokenizer = rspamd_stat_get_tokenizer (clf->tokenizer->name);
+ g_assert (stat_ctx->tokenizer != NULL);
+ stat_ctx->tkcf = stat_ctx->tokenizer->get_config (cfg->cfg_pool,
+ clf->tokenizer, NULL);
+ }
+
+ cl = g_slice_alloc0 (sizeof (*cl));
+ cl->cfg = clf;
+ cl->ctx = stat_ctx;
+ cl->statfiles_ids = g_array_new (FALSE, FALSE, sizeof (gint));
+ cl->subrs = rspamd_stat_get_classifier (clf->classifier);
+ g_assert (cl->subrs != NULL);
+ cl->subrs->init_func (cfg->cfg_pool, cl);
+
+ /* Init classifier cache */
+ if (clf->opts) {
+ cache_obj = ucl_object_find_key (clf->opts, "cache");
+
+ if (cache_obj) {
+ cache_name_obj = ucl_object_find_key (cache_obj, "name");
+ }
+
+ if (cache_name_obj) {
+ cache_name = ucl_object_tostring (cache_name_obj);
+ }
+ }
+
+ cl->cache = rspamd_stat_get_cache (cache_name);
+ g_assert (cl->cache != NULL);
+ cl->cachecf = cl->cache->init (stat_ctx, cfg, cache_obj);
+
+ curst = clf->statfiles;
+
+ while (curst) {
+ stf = curst->data;
+ st = g_slice_alloc0 (sizeof (*st));
+ st->classifier = cl;
+ st->stcf = stf;
+ st->backend = bk;
+ st->bkcf = bk->init (stat_ctx, cfg, st);
+ msg_debug_config ("added backend %s for symbol %s",
+ bk->name, stf->symbol);
+
+ if (st->bkcf == NULL) {
+ msg_err_config ("cannot init backend %s for statfile %s",
+ clf->backend, stf->symbol);
+
+ g_slice_free1 (sizeof (*st), st);
+ }
+ else {
+ st->id = stat_ctx->statfiles->len;
+ g_ptr_array_add (stat_ctx->statfiles, st);
+ g_array_append_val (cl->statfiles_ids, st->id);
+ }
+
+ curst = curst->next;
+ }
+
+ g_ptr_array_add (stat_ctx->classifiers, cl);
- /* Init caches */
- for (i = 0; i < stat_ctx->caches_count; i ++) {
- stat_ctx->caches[i].ctx = stat_ctx->caches[i].init (stat_ctx, cfg);
- msg_debug_config ("added cache %s", stat_ctx->caches[i].name);
+ cur = cur->next;
}
}
@@ -129,12 +197,9 @@ rspamd_stat_close (void)
g_assert (stat_ctx != NULL);
- for (i = 0; i < stat_ctx->backends_count; i ++) {
- if (stat_ctx->backends[i].close != NULL) {
- stat_ctx->backends[i].close (stat_ctx->backends[i].ctx);
- msg_debug_config ("closed backend %s", stat_ctx->backends[i].name);
- }
- }
+ /* TODO: add cleanup routine */
+
+ REF_RELEASE (stat_ctx->cfg);
}
struct rspamd_stat_ctx *
@@ -148,9 +213,13 @@ rspamd_stat_get_classifier (const gchar *name)
{
guint i;
+ if (name == NULL || name[0] == '\0') {
+ name = RSPAMD_DEFAULT_CLASSIFIER;
+ }
+
for (i = 0; i < stat_ctx->classifiers_count; i ++) {
- if (strcmp (name, stat_ctx->classifiers[i].name) == 0) {
- return &stat_ctx->classifiers[i];
+ if (strcmp (name, stat_ctx->classifiers_subrs[i].name) == 0) {
+ return &stat_ctx->classifiers_subrs[i];
}
}
@@ -167,8 +236,8 @@ rspamd_stat_get_backend (const gchar *name)
}
for (i = 0; i < stat_ctx->backends_count; i ++) {
- if (strcmp (name, stat_ctx->backends[i].name) == 0) {
- return &stat_ctx->backends[i];
+ if (strcmp (name, stat_ctx->backends_subrs[i].name) == 0) {
+ return &stat_ctx->backends_subrs[i];
}
}
@@ -185,8 +254,26 @@ rspamd_stat_get_tokenizer (const gchar *name)
}
for (i = 0; i < stat_ctx->tokenizers_count; i ++) {
- if (strcmp (name, stat_ctx->tokenizers[i].name) == 0) {
- return &stat_ctx->tokenizers[i];
+ if (strcmp (name, stat_ctx->tokenizers_subrs[i].name) == 0) {
+ return &stat_ctx->tokenizers_subrs[i];
+ }
+ }
+
+ return NULL;
+}
+
+struct rspamd_stat_cache *
+rspamd_stat_get_cache (const gchar *name)
+{
+ guint i;
+
+ if (name == NULL || name[0] == '\0') {
+ name = RSPAMD_DEFAULT_CACHE;
+ }
+
+ for (i = 0; i < stat_ctx->caches_count; i++) {
+ if (strcmp (name, stat_ctx->caches_subrs[i].name) == 0) {
+ return &stat_ctx->caches_subrs[i];
}
}
diff --git a/src/libstat/stat_internal.h b/src/libstat/stat_internal.h
index 640196788..591c47381 100644
--- a/src/libstat/stat_internal.h
+++ b/src/libstat/stat_internal.h
@@ -30,20 +30,6 @@
#include "backends/backends.h"
#include "learn_cache/learn_cache.h"
-enum stat_process_stage {
- RSPAMD_STAT_STAGE_PRE = 0,
- RSPAMD_STAT_STAGE_POST
-};
-
-struct rspamd_tokenizer_runtime {
- GTree *tokens;
- const gchar *name;
- struct rspamd_stat_tokenizer *tokenizer;
- struct rspamd_tokenizer_config *tkcf;
- gpointer config;
- gsize conf_len;
-};
-
struct rspamd_statfile_runtime {
struct rspamd_statfile_config *st;
gpointer backend_runtime;
@@ -51,28 +37,24 @@ struct rspamd_statfile_runtime {
guint64 total_hits;
};
-struct rspamd_classifier_runtime {
- struct rspamd_classifier_config *clcf;
- struct classifier_ctx *clctx;
- struct rspamd_stat_classifier *cl;
- struct rspamd_stat_backend *backend;
- struct rspamd_tokenizer_runtime *tok;
- double ham_prob;
- double spam_prob;
- enum stat_process_stage stage;
- guint64 total_spam;
- guint64 total_ham;
- guint64 processed_tokens;
- GList *st_runtime;
- guint start_pos;
- guint end_pos;
- gboolean skipped;
+/* Common classifier structure */
+struct rspamd_classifier {
+ struct rspamd_stat_ctx *ctx;
+ struct rspamd_stat_cache *cache;
+ gpointer cachecf;
+ GArray *statfiles_ids;
+ gulong spam_learns;
+ gulong ham_learns;
+ struct rspamd_classifier_config *cfg;
+ struct rspamd_stat_classifier *subrs;
};
-struct rspamd_token_result {
- double value;
- struct rspamd_statfile_runtime *st_runtime;
- struct rspamd_classifier_runtime *cl_runtime;
+struct rspamd_statfile {
+ gint id;
+ struct rspamd_statfile_config *stcf;
+ struct rspamd_classifier *classifier;
+ struct rspamd_stat_backend *backend;
+ gpointer bkcf;
};
#define RSPAMD_MAX_TOKEN_LEN 16
@@ -80,21 +62,27 @@ typedef struct token_node_s {
guchar data[RSPAMD_MAX_TOKEN_LEN];
guint window_idx;
guint datalen;
- GArray *results;
+ gdouble values[];
} rspamd_token_t;
struct rspamd_stat_ctx {
- struct rspamd_stat_classifier *classifiers;
+ /* Subroutines for all objects */
+ struct rspamd_stat_classifier *classifiers_subrs;
guint classifiers_count;
- struct rspamd_stat_tokenizer *tokenizers;
+ struct rspamd_stat_tokenizer *tokenizers_subrs;
guint tokenizers_count;
- struct rspamd_stat_backend *backends;
+ struct rspamd_stat_backend *backends_subrs;
guint backends_count;
- struct rspamd_stat_cache *caches;
+ struct rspamd_stat_cache *caches_subrs;
guint caches_count;
- guint statfiles;
+ /* Runtime configuration */
+ GPtrArray *statfiles; /* struct rspamd_statfile */
+ GPtrArray *classifiers; /* struct rspamd_classifier */
struct rspamd_config *cfg;
+ /* Global tokenizer */
+ struct rspamd_stat_tokenizer *tokenizer;
+ gpointer tkcf;
};
typedef enum rspamd_learn_cache_result {
@@ -107,6 +95,7 @@ struct rspamd_stat_ctx * rspamd_stat_get_ctx (void);
struct rspamd_stat_classifier * rspamd_stat_get_classifier (const gchar *name);
struct rspamd_stat_backend * rspamd_stat_get_backend (const gchar *name);
struct rspamd_stat_tokenizer * rspamd_stat_get_tokenizer (const gchar *name);
+struct rspamd_stat_cache * rspamd_stat_get_cache (const gchar *name);
static GQuark rspamd_stat_quark (void)
{
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index 8bdf394b1..8a4269727 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2015, Vsevolod Stakhov
+/* Copyright (c) 2015-2016, Vsevolod Stakhov
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -37,18 +37,8 @@
static const gint similarity_treshold = 80;
-struct preprocess_cb_data {
- struct rspamd_task *task;
- GList *classifier_runtimes;
- struct rspamd_tokenizer_runtime *tok;
- guint results_count;
- gboolean unlearn;
- gboolean spam;
-};
-
static void
rspamd_stat_tokenize_header (struct rspamd_task *task,
- struct rspamd_tokenizer_runtime *tok,
const gchar *name, const gchar *prefix, GArray *ar)
{
struct raw_header *rh, *cur;
@@ -81,8 +71,8 @@ rspamd_stat_tokenize_header (struct rspamd_task *task,
}
static void
-rspamd_stat_tokenize_parts_metadata (struct rspamd_task *task,
- struct rspamd_tokenizer_runtime *tok)
+rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
+ struct rspamd_task *task)
{
struct rspamd_image *img;
struct mime_part *part;
@@ -164,16 +154,17 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_task *task,
cur = g_list_first (task->cfg->classify_headers);
while (cur) {
- rspamd_stat_tokenize_header (task, tok, cur->data, "UA:", ar);
+ rspamd_stat_tokenize_header (task, cur->data, "UA:", ar);
cur = g_list_next (cur);
}
- tok->tokenizer->tokenize_func (tok,
+ st_ctx->tokenizer->tokenize_func (st_ctx,
task->task_pool,
ar,
TRUE,
- "META:");
+ "META:",
+ task->tokens);
g_array_free (ar, TRUE);
}
@@ -183,24 +174,36 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_task *task,
*/
static void
rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
- struct rspamd_task *task, struct rspamd_tokenizer_runtime *tok)
+ struct rspamd_task *task)
{
struct mime_text_part *part;
GArray *words;
gchar *sub;
- guint i;
+ guint i, reserved_len = 0;
gint *pdiff;
- gboolean compat;
- compat = tok->tokenizer->is_compat (tok);
+ for (i = 0; i < task->text_parts->len; i++) {
+ part = g_ptr_array_index (task->text_parts, i);
+
+ if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) {
+ reserved_len += part->normalized_words->len;
+ }
+ /* XXX: normal window size */
+ reserved_len += 5;
+ }
+
+ task->tokens = g_ptr_array_sized_new (reserved_len);
+ rspamd_mempool_add_destructor (task->task_pool,
+ rspamd_ptr_array_free_hard, task->tokens);
pdiff = rspamd_mempool_get_variable (task->task_pool, "parts_distance");
for (i = 0; i < task->text_parts->len; i ++) {
part = g_ptr_array_index (task->text_parts, i);
if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) {
- tok->tokenizer->tokenize_func (tok, task->task_pool,
- part->normalized_words, IS_PART_UTF (part), NULL);
+ st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool,
+ part->normalized_words, IS_PART_UTF (part),
+ NULL, task->tokens);
}
@@ -219,324 +222,118 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
}
if (sub != NULL) {
- words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, compat,
+ words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, FALSE,
NULL);
if (words != NULL) {
- tok->tokenizer->tokenize_func (tok,
+ st_ctx->tokenizer->tokenize_func (st_ctx,
task->task_pool,
words,
TRUE,
- "SUBJECT");
+ "SUBJECT",
+ task->tokens);
g_array_free (words, TRUE);
}
}
- rspamd_stat_tokenize_parts_metadata (task, tok);
+ rspamd_stat_tokenize_parts_metadata (st_ctx, task);
}
-static struct rspamd_tokenizer_runtime *
-rspamd_stat_get_tokenizer_runtime (struct rspamd_tokenizer_config *cf,
- struct rspamd_stat_ctx *st_ctx,
- struct rspamd_task *task,
- struct rspamd_classifier_runtime *cl_runtime,
- gpointer conf, gsize conf_len)
+static void
+rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
+ struct rspamd_task *task, gboolean learn)
{
- struct rspamd_tokenizer_runtime *tok = NULL;
- const gchar *name;
-
- if (cf == NULL || cf->name == NULL) {
- name = RSPAMD_DEFAULT_TOKENIZER;
- cf->name = name;
- }
- else {
- name = cf->name;
- }
-
- tok = rspamd_mempool_alloc (task->task_pool, sizeof (*tok));
- tok->tokenizer = rspamd_stat_get_tokenizer (name);
- tok->tkcf = cf;
-
- if (tok->tokenizer == NULL) {
- return NULL;
- }
-
- if (!tok->tokenizer->load_config (task->task_pool, tok, conf, conf_len)) {
- return NULL;
- }
+ guint i;
+ struct rspamd_statfile *st;
+ gpointer bk_run;
- tok->tokens = g_tree_new (token_node_compare_func);
+ rspamd_stat_process_tokenize (st_ctx, task);
+ task->stat_runtimes = g_ptr_array_sized_new (st_ctx->statfiles->len);
rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t)g_tree_destroy, tok->tokens);
- tok->name = name;
- rspamd_stat_process_tokenize (st_ctx, task, tok);
- cl_runtime->tok = tok;
-
- return tok;
-}
-
-static gboolean
-preprocess_init_stat_token (gpointer k, gpointer v, gpointer d)
-{
- rspamd_token_t *t = (rspamd_token_t *)v;
- struct preprocess_cb_data *cbdata = (struct preprocess_cb_data *)d;
- struct rspamd_statfile_runtime *st_runtime;
- struct rspamd_classifier_runtime *cl_runtime;
- struct rspamd_token_result *res;
- GList *cur, *curst;
- struct rspamd_task *task;
- gint i = 0;
-
- task = cbdata->task;
- t->results = g_array_sized_new (FALSE, TRUE,
- sizeof (struct rspamd_token_result), cbdata->results_count);
- g_array_set_size (t->results, cbdata->results_count);
- rspamd_mempool_add_destructor (cbdata->task->task_pool,
- rspamd_array_free_hard, t->results);
-
- cur = g_list_first (cbdata->classifier_runtimes);
-
- while (cur) {
- cl_runtime = (struct rspamd_classifier_runtime *)cur->data;
-
- if (cl_runtime->clcf->min_tokens > 0 &&
- (guint32)g_tree_nnodes (cbdata->tok->tokens) < cl_runtime->clcf->min_tokens) {
- /* Skip this classifier */
- cur = g_list_next (cur);
- cl_runtime->skipped = TRUE;
- continue;
- }
+ rspamd_ptr_array_free_hard, task->stat_runtimes);
- curst = cl_runtime->st_runtime;
-
- while (curst) {
+ for (i = 0; i < st_ctx->statfiles->len; i ++) {
+ st = g_ptr_array_index (st_ctx->statfiles, i);
+ g_assert (st != NULL);
- st_runtime = (struct rspamd_statfile_runtime *)curst->data;
- res = &g_array_index (t->results, struct rspamd_token_result, i);
- res->cl_runtime = cl_runtime;
- res->st_runtime = st_runtime;
-
- if (cl_runtime->backend->process_token (cbdata->task, t, res,
- cl_runtime->backend->ctx)) {
+ bk_run = st->backend->runtime (task, st->stcf, learn, st->bkcf);
- if (cl_runtime->clcf->max_tokens > 0 &&
- cl_runtime->processed_tokens > cl_runtime->clcf->max_tokens) {
- msg_debug_task ("message contains more tokens than allowed for %s classifier: "
- "%uL > %ud", cl_runtime->clcf->name,
- cl_runtime->processed_tokens,
- cl_runtime->clcf->max_tokens);
-
- return TRUE;
- }
- }
-
- i ++;
- curst = g_list_next (curst);
+ if (bk_run == NULL) {
+ msg_err_task ("cannot init backend %s for statfile %s",
+ st->backend->name, st->stcf->symbol);
}
- cur = g_list_next (cur);
+ g_ptr_array_add (task->stat_runtimes, bk_run);
}
-
-
- return FALSE;
}
-static GList*
-rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
- struct rspamd_task *task,
- lua_State *L,
- gint op,
- gboolean spam,
- const gchar *classifier,
- GError **err)
+static void
+rspamd_stat_backends_process (struct rspamd_stat_ctx *st_ctx,
+ struct rspamd_task *task)
{
- struct rspamd_classifier_config *clcf;
- struct rspamd_statfile_config *stcf;
- struct rspamd_classifier_runtime *cl_runtime;
- struct rspamd_statfile_runtime *st_runtime;
- struct rspamd_stat_backend *bk;
- gpointer backend_runtime, tok_config;
- GList *cur, *st_list = NULL, *curst;
- GList *cl_runtimes = NULL;
- guint result_size = 0, start_pos = 0, end_pos = 0;
- gsize conf_len;
- struct preprocess_cb_data cbdata;
-
- cur = g_list_first (task->cfg->classifiers);
-
- while (cur) {
- clcf = (struct rspamd_classifier_config *)cur->data;
- st_list = NULL;
-
- if (classifier != NULL &&
- (clcf->name == NULL || strcmp (clcf->name, classifier) != 0)) {
- /* Skip this classifier */
- msg_debug_task ("skip classifier %s, as we are requested to check %s only",
- clcf->name, classifier);
- cur = g_list_next (cur);
- continue;
- }
-
- if (clcf->pre_callbacks != NULL) {
- st_list = rspamd_lua_call_cls_pre_callbacks (clcf, task, FALSE,
- FALSE, L);
- }
- if (st_list != NULL) {
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t)g_list_free, st_list);
- }
- else {
- st_list = clcf->statfiles;
- }
-
- /* Now init runtime values */
- cl_runtime = rspamd_mempool_alloc0 (task->task_pool, sizeof (*cl_runtime));
- cl_runtime->cl = rspamd_stat_get_classifier (clcf->classifier);
-
- if (cl_runtime->cl == NULL) {
- g_set_error (err, rspamd_stat_quark(), 500,
- "classifier %s is not defined", clcf->classifier);
- g_list_free (cl_runtimes);
- return NULL;
- }
-
- cl_runtime->clcf = clcf;
-
- bk = rspamd_stat_get_backend (clcf->backend);
- if (bk == NULL) {
- g_set_error (err, rspamd_stat_quark(), 500,
- "backend %s is not defined", clcf->backend);
- g_list_free (cl_runtimes);
- return NULL;
- }
-
- cl_runtime->backend = bk;
-
- curst = st_list;
- while (curst != NULL) {
- stcf = (struct rspamd_statfile_config *)curst->data;
-
- /* On learning skip statfiles that do not belong to class */
- if (op == RSPAMD_LEARN_OP && (spam != stcf->is_spam)) {
- curst = g_list_next (curst);
- continue;
- }
-
- backend_runtime = bk->runtime (task, stcf, op != RSPAMD_CLASSIFY_OP,
- bk->ctx);
-
- if (backend_runtime == NULL) {
- if (op != RSPAMD_CLASSIFY_OP) {
- /* Assume backend absence as fatal error */
- g_set_error (err, rspamd_stat_quark(), 500,
- "cannot open backend for statfile %s", stcf->symbol);
- g_list_free (cl_runtimes);
-
- return NULL;
- }
- else {
- /* Just skip this element */
- msg_warn ("backend of type %s does not exist: %s",
- clcf->backend, stcf->symbol);
- curst = g_list_next (curst);
- continue;
- }
- }
-
- tok_config = bk->load_tokenizer_config (backend_runtime,
- &conf_len);
-
- if (cl_runtime->tok == NULL) {
- cl_runtime->tok = rspamd_stat_get_tokenizer_runtime (clcf->tokenizer,
- st_ctx, task, cl_runtime, tok_config, conf_len);
-
- if (cl_runtime->tok == NULL) {
- g_set_error (err, rspamd_stat_quark(), 500,
- "cannot initialize tokenizer for statfile %s", stcf->symbol);
- g_list_free (cl_runtimes);
-
- return NULL;
- }
- }
+ guint i;
+ struct rspamd_statfile *st;
+ struct rspamd_classifier *cl;
+ gpointer bk_run;
- if (!cl_runtime->tok->tokenizer->compatible_config (
- cl_runtime->tok, tok_config, conf_len)) {
- g_set_error (err, rspamd_stat_quark(), 500,
- "incompatible tokenizer for statfile %s", stcf->symbol);
- g_list_free (cl_runtimes);
+ g_assert (task->stat_runtimes != NULL);
- return NULL;
- }
+ for (i = 0; i < st_ctx->statfiles->len; i++) {
+ st = g_ptr_array_index (st_ctx->statfiles, i);
+ bk_run = g_ptr_array_index (task->stat_runtimes, i);
+ cl = st->classifier;
+ g_assert (st != NULL);
- st_runtime = rspamd_mempool_alloc0 (task->task_pool,
- sizeof (*st_runtime));
- st_runtime->st = stcf;
- st_runtime->backend_runtime = backend_runtime;
+ if (bk_run != NULL) {
+ st->backend->process_tokens (task, task->tokens, i, bk_run);
- if (stcf->is_spam) {
- cl_runtime->total_spam += bk->total_learns (task, backend_runtime,
- bk->ctx);
+ if (st->stcf->is_spam) {
+ cl->spam_learns = st->backend->total_learns (task,
+ bk_run,
+ st_ctx);
}
else {
- cl_runtime->total_ham += bk->total_learns (task, backend_runtime,
- bk->ctx);
+ cl->ham_learns = st->backend->total_learns (task,
+ bk_run,
+ st_ctx);
}
-
- cl_runtime->st_runtime = g_list_prepend (cl_runtime->st_runtime,
- st_runtime);
- result_size ++;
-
- curst = g_list_next (curst);
- end_pos ++;
- }
-
- if (cl_runtime->st_runtime != NULL) {
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t)g_list_free,
- cl_runtime->st_runtime);
- cl_runtimes = g_list_prepend (cl_runtimes, cl_runtime);
}
+ }
+}
- /* Set positions in the results array */
- cl_runtime->start_pos = start_pos;
- cl_runtime->end_pos = end_pos;
+static void
+rspamd_stat_backends_post_process (struct rspamd_stat_ctx *st_ctx,
+ struct rspamd_task *task)
+{
+ guint i;
+ struct rspamd_statfile *st;
+ gpointer bk_run;
- msg_debug_task ("added runtime for %s classifier from %ud to %ud",
- clcf->name, start_pos, end_pos);
+ g_assert (task->stat_runtimes != NULL);
- start_pos = end_pos;
+ for (i = 0; i < st_ctx->statfiles->len; i++) {
+ st = g_ptr_array_index (st_ctx->statfiles, i);
+ bk_run = g_ptr_array_index (task->stat_runtimes, i);
+ g_assert (st != NULL);
- /* Next classifier */
- cur = g_list_next (cur);
+ if (bk_run != NULL) {
+ st->backend->finalize_process (task, bk_run, st_ctx);
+ }
}
+}
- if (cl_runtimes != NULL) {
- /* Reverse list as we have used g_list_prepend */
- cl_runtimes = g_list_reverse (cl_runtimes);
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) g_list_free,
- cl_runtimes);
- cur = g_list_first (cl_runtimes);
-
- while (cur) {
- cl_runtime = cur->data;
+static void
+rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx,
+ struct rspamd_task *task)
+{
+ guint i;
+ struct rspamd_classifier *cl;
- cbdata.results_count = result_size;
- cbdata.classifier_runtimes = cl_runtimes;
- cbdata.task = task;
- cbdata.tok = cl_runtime->tok;
- g_tree_foreach (cbdata.tok->tokens, preprocess_init_stat_token,
- &cbdata);
+ for (i = 0; i < st_ctx->classifiers->len; i++) {
+ cl = g_ptr_array_index (st_ctx->classifiers, i);
+ g_assert (cl != NULL);
- cur = g_list_next (cur);
- }
- }
- else if (classifier != NULL) {
- /* We likely cannot find any classifier with this name */
- g_set_error (err, rspamd_stat_quark (), 404,
- "cannot find classifier %s", classifier);
+ cl->subrs->classify_func (cl, task->tokens, task);
}
-
- return cl_runtimes;
}
rspamd_stat_result_t
@@ -544,102 +341,30 @@ rspamd_stat_classify (struct rspamd_task *task, lua_State *L, guint stage,
GError **err)
{
struct rspamd_stat_ctx *st_ctx;
- struct rspamd_statfile_runtime *st_run;
- struct rspamd_classifier_runtime *cl_run;
- GList *cl_runtimes;
- GList *cur, *curst;
- gboolean ret = RSPAMD_STAT_PROCESS_OK;
+ rspamd_stat_result_t ret = RSPAMD_STAT_PROCESS_OK;
st_ctx = rspamd_stat_get_ctx ();
g_assert (st_ctx != NULL);
- cl_runtimes = task->cl_runtimes;
- if (stage == RSPAMD_TASK_STAGE_CLASSIFIERS_PRE) {
- /* Initialize classifiers and statfiles runtime */
- if (task->cl_runtimes == NULL) {
- if ((cl_runtimes = rspamd_stat_preprocess (st_ctx, task, L,
- RSPAMD_CLASSIFY_OP, FALSE, NULL, err)) == NULL) {
- return RSPAMD_STAT_PROCESS_OK;
- }
-
- task->cl_runtimes = cl_runtimes;
- cur = cl_runtimes;
- /* Finalize backend so it can load tokens delayed if needed */
- while (cur) {
- cl_run = (struct rspamd_classifier_runtime *) cur->data;
- curst = cl_run->st_runtime;
-
- while (curst) {
- st_run = curst->data;
- cl_run->backend->finalize_process (task,
- st_run->backend_runtime,
- cl_run->backend->ctx);
- curst = g_list_next (curst);
- }
-
- cur = g_list_next (cur);
- }
- }
+ if (stage == RSPAMD_TASK_STAGE_CLASSIFIERS_PRE) {
+ /* Preprocess tokens */
+ rspamd_stat_preprocess (st_ctx, task, FALSE);
}
else if (stage == RSPAMD_TASK_STAGE_CLASSIFIERS) {
- cur = cl_runtimes;
-
- /* The first stage of classification */
- while (cur) {
- cl_run = (struct rspamd_classifier_runtime *) cur->data;
- cl_run->stage = RSPAMD_STAT_STAGE_PRE;
-
- if (cl_run->cl) {
- cl_run->clctx = cl_run->cl->init_func (task->task_pool,
- cl_run->clcf);
-
- if (cl_run->clctx != NULL) {
- cl_run->cl->classify_func (cl_run->clctx, cl_run->tok->tokens,
- cl_run, task);
- }
- }
-
- cur = g_list_next (cur);
- }
+ /* Process backends */
+ rspamd_stat_backends_process (st_ctx, task);
}
else if (stage == RSPAMD_TASK_STAGE_CLASSIFIERS_POST) {
- cur = cl_runtimes;
- /* The second stage of classification */
- while (cur) {
- cl_run = (struct rspamd_classifier_runtime *) cur->data;
- cl_run->stage = RSPAMD_STAT_STAGE_POST;
-
- if (cl_run->skipped) {
- cur = g_list_next (cur);
- continue;
- }
-
- cl_run = (struct rspamd_classifier_runtime *) cur->data;
- cl_run->stage = RSPAMD_STAT_STAGE_POST;
-
- if (cl_run->skipped) {
- cur = g_list_next (cur);
- continue;
- }
-
- if (cl_run->cl) {
- if (cl_run->clctx != NULL) {
- if (cl_run->cl->classify_func (cl_run->clctx,
- cl_run->tok->tokens,
- cl_run, task)) {
- ret = RSPAMD_STAT_PROCESS_OK;
- }
- }
- }
-
- cur = g_list_next (cur);
- }
+ /* Process classifiers */
+ rspamd_stat_backends_post_process (st_ctx, task);
+ rspamd_stat_classifiers_process (st_ctx, task);
}
return ret;
}
+#if 0
static gboolean
rspamd_stat_learn_token (gpointer k, gpointer v, gpointer d)
{
@@ -910,3 +635,26 @@ rspamd_stat_result_t rspamd_stat_statistics (struct rspamd_task *task,
return RSPAMD_STAT_PROCESS_OK;
}
+#else
+/* TODO: finish learning */
+rspamd_stat_result_t rspamd_stat_learn (struct rspamd_task *task,
+ gboolean spam, lua_State *L, const gchar *classifier,
+ GError **err)
+{
+ return RSPAMD_STAT_PROCESS_ERROR;
+}
+
+/**
+ * Get the overall statistics for all statfile backends
+ * @param cfg configuration
+ * @param total_learns the total number of learns is stored here
+ * @return array of statistical information
+ */
+rspamd_stat_result_t rspamd_stat_statistics (struct rspamd_task *task,
+ struct rspamd_config *cfg,
+ guint64 *total_learns,
+ ucl_object_t **res)
+{
+ return RSPAMD_STAT_PROCESS_ERROR;
+}
+#endif
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index 2d1b3bb3e..55a0c6bba 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -189,6 +189,7 @@ rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
return osb_cf;
}
+#if 0
gboolean
rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt,
gpointer ptr, gsize len)
@@ -223,28 +224,68 @@ rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt,
return ret;
}
+gboolean
+rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool,
+ struct rspamd_tokenizer_runtime *rt,
+ gpointer ptr, gsize len)
+{
+ struct rspamd_osb_tokenizer_config *osb_cf;
+
+ if (ptr == NULL || len == 0) {
+ osb_cf = rspamd_tokenizer_osb_config_from_ucl (pool, rt->tkcf->opts);
+
+ if (osb_cf->ht != RSPAMD_OSB_HASH_COMPAT) {
+ /* Trying to load incompatible configuration */
+ msg_err_pool ("cannot load tokenizer configuration from a legacy "
+ "statfile; maybe you have forgotten to set 'compat' option"
+ " in the tokenizer configuration");
+
+ return FALSE;
+ }
+ }
+ else {
+ g_assert (len == sizeof (*osb_cf));
+ osb_cf = ptr;
+ }
+
+ rt->config = osb_cf;
+ rt->conf_len = sizeof (*osb_cf);
+
+ return TRUE;
+}
+
+gboolean
+rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt)
+{
+ struct rspamd_osb_tokenizer_config *osb_cf = rt->config;
+
+ return (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT);
+}
+#endif
+
+
+
gint
-rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
- rspamd_mempool_t * pool,
- GArray * input,
- gboolean is_utf,
- const gchar *prefix)
+rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
+ rspamd_mempool_t *pool,
+ GArray *words,
+ gboolean is_utf,
+ const gchar *prefix,
+ GPtrArray *result)
{
- rspamd_token_t *new = NULL;
+ rspamd_token_t *new_tok = NULL;
rspamd_ftok_t *token;
struct rspamd_osb_tokenizer_config *osb_cf;
guint64 *hashpipe, cur, seed;
guint32 h1, h2;
+ gsize token_size;
guint processed = 0, i, w, window_size;
- GTree *tree = rt->tokens;
-
- g_assert (tree != NULL);
- if (input == NULL) {
+ if (words == NULL) {
return FALSE;
}
- osb_cf = rt->config;
+ osb_cf = ctx->tkcf;
window_size = osb_cf->window_size;
if (prefix) {
@@ -256,9 +297,11 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
hashpipe = g_alloca (window_size * sizeof (hashpipe[0]));
memset (hashpipe, 0xfe, window_size * sizeof (hashpipe[0]));
+ token_size = sizeof (rspamd_token_t) + sizeof (gdouble) * ctx->statfiles->len;
+ g_assert (token_size > 0);
- for (w = 0; w < input->len; w ++) {
- token = &g_array_index (input, rspamd_ftok_t, w);
+ for (w = 0; w < words->len; w ++) {
+ token = &g_array_index (words, rspamd_ftok_t, w);
if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
cur = rspamd_fstrhash_lc (token, is_utf);
@@ -278,6 +321,25 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
}
}
+#define ADD_TOKEN do {\
+ new_tok = rspamd_mempool_alloc0 (pool, token_size); \
+ new_tok->datalen = sizeof (gint64); \
+ if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { \
+ h1 = ((guint32)hashpipe[0]) * primes[0] + \
+ ((guint32)hashpipe[i]) * primes[i << 1]; \
+ h2 = ((guint32)hashpipe[0]) * primes[1] + \
+ ((guint32)hashpipe[i]) * primes[(i << 1) - 1]; \
+ memcpy(new_tok->data, &h1, sizeof (h1)); \
+ memcpy(new_tok->data + sizeof (h1), &h2, sizeof (h2)); \
+ } \
+ else { \
+ cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; \
+ memcpy (new_tok->data, &cur, sizeof (cur)); \
+ } \
+ new_tok->window_idx = i + 1; \
+ g_ptr_array_add (result, new_tok); \
+ } while(0)
+
if (processed < window_size) {
/* Just fill a hashpipe */
hashpipe[window_size - ++processed] = cur;
@@ -291,97 +353,20 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
processed++;
for (i = 1; i < window_size; i++) {
- new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
- new->datalen = sizeof (gint64);
-
- if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
- h1 = ((guint32)hashpipe[0]) * primes[0] +
- ((guint32)hashpipe[i]) * primes[i << 1];
- h2 = ((guint32)hashpipe[0]) * primes[1] +
- ((guint32)hashpipe[i]) * primes[(i << 1) - 1];
-
- memcpy(new->data, &h1, sizeof (h1));
- memcpy(new->data + sizeof (h1), &h2, sizeof (h2));
- }
- else {
- cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
- memcpy (new->data, &cur, sizeof (cur));
- }
-
- new->window_idx = i + 1;
-
- if (g_tree_lookup (tree, new) == NULL) {
- g_tree_insert (tree, new, new);
- }
+ ADD_TOKEN;
}
}
}
if (processed <= window_size) {
memmove (hashpipe, hashpipe + (window_size - processed + 1), processed);
- for (i = 1; i < processed; i++) {
- new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
- new->datalen = sizeof (gint64);
-
- if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
- h1 = ((guint32)hashpipe[0]) * primes[0] +
- ((guint32)hashpipe[i]) * primes[i << 1];
- h2 = ((guint32)hashpipe[0]) * primes[1] +
- ((guint32)hashpipe[i]) * primes[(i << 1) - 1];
- memcpy(new->data, &h1, sizeof (h1));
- memcpy(new->data + sizeof (h1), &h2, sizeof (h2));
- }
- else {
- cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
- memcpy (new->data, &cur, sizeof (cur));
- }
-
- new->window_idx = i + 1;
-
- if (g_tree_lookup (tree, new) == NULL) {
- g_tree_insert (tree, new, new);
- }
- }
- }
-
- return TRUE;
-}
-
-
-gboolean
-rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool,
- struct rspamd_tokenizer_runtime *rt,
- gpointer ptr, gsize len)
-{
- struct rspamd_osb_tokenizer_config *osb_cf;
- if (ptr == NULL || len == 0) {
- osb_cf = rspamd_tokenizer_osb_config_from_ucl (pool, rt->tkcf->opts);
-
- if (osb_cf->ht != RSPAMD_OSB_HASH_COMPAT) {
- /* Trying to load incompatible configuration */
- msg_err_pool ("cannot load tokenizer configuration from a legacy "
- "statfile; maybe you have forgotten to set 'compat' option"
- " in the tokenizer configuration");
-
- return FALSE;
+ for (i = 1; i < processed; i++) {
+ ADD_TOKEN;
}
}
- else {
- g_assert (len == sizeof (*osb_cf));
- osb_cf = ptr;
- }
- rt->config = osb_cf;
- rt->conf_len = sizeof (*osb_cf);
+#undef ADD_TOKEN
return TRUE;
}
-
-gboolean
-rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt)
-{
- struct rspamd_osb_tokenizer_config *osb_cf = rt->config;
-
- return (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT);
-}
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index f4c9a5ed3..70ff7560c 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -10,23 +10,19 @@
#define RSPAMD_DEFAULT_TOKENIZER "osb"
struct rspamd_tokenizer_runtime;
+struct rspamd_stat_ctx;
/* Common tokenizer structure */
struct rspamd_stat_tokenizer {
gchar *name;
gpointer (*get_config) (rspamd_mempool_t *pool,
struct rspamd_tokenizer_config *cf, gsize *len);
- gboolean (*compatible_config) (struct rspamd_tokenizer_runtime *rt,
- gpointer ptr, gsize len);
- gboolean (*load_config) (rspamd_mempool_t *pool,
- struct rspamd_tokenizer_runtime *rt,
- gpointer ptr, gsize len);
- gboolean (*is_compat) (struct rspamd_tokenizer_runtime *rt);
- gint (*tokenize_func)(struct rspamd_tokenizer_runtime *rt,
+ gint (*tokenize_func)(struct rspamd_stat_ctx *ctx,
rspamd_mempool_t *pool,
GArray *words,
gboolean is_utf,
- const gchar *prefix);
+ const gchar *prefix,
+ GPtrArray *result);
};
/* Compare two token nodes */
@@ -39,28 +35,17 @@ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
guint64 *hash);
/* OSB tokenize function */
-gint rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
- rspamd_mempool_t *pool,
- GArray *input,
- gboolean is_utf,
- const gchar *prefix);
+gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
+ rspamd_mempool_t *pool,
+ GArray *words,
+ gboolean is_utf,
+ const gchar *prefix,
+ GPtrArray *result);
gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
struct rspamd_tokenizer_config *cf,
gsize *len);
-gboolean
-rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt,
- gpointer ptr, gsize len);
-
-gboolean
-rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool,
- struct rspamd_tokenizer_runtime *rt,
- gpointer ptr, gsize len);
-
-gboolean
-rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt);
-
#endif
/*
* vi:ts=4