From: Vsevolod Stakhov Date: Fri, 13 Feb 2015 16:46:06 +0000 (+0000) Subject: Add sqlite3 learn cache. X-Git-Tag: 0.9.0~713 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=f086958b4b6b71952bb47e422aebaaa00c3143ea;p=rspamd.git Add sqlite3 learn cache. --- diff --git a/src/libstat/CMakeLists.txt b/src/libstat/CMakeLists.txt index 17c553989..5c4e2bc74 100644 --- a/src/libstat/CMakeLists.txt +++ b/src/libstat/CMakeLists.txt @@ -8,16 +8,20 @@ SET(TOKENIZERSSRC tokenizers/tokenizers.c SET(CLASSIFIERSSRC classifiers/bayes.c) SET(BACKENDSSRC backends/mmaped_file.c) + +SET(CACHESSRC learn_cache/sqlite3_cache.c) ADD_LIBRARY(rspamd-stat ${LINK_TYPE} ${LIBSTATSRC} ${TOKENIZERSSRC} ${CLASSIFIERSSRC} - ${BACKENDSSRC}) + ${BACKENDSSRC} + ${CACHESSRC}) IF(NOT DEBIAN_BUILD) SET_TARGET_PROPERTIES(rspamd-stat PROPERTIES VERSION ${RSPAMD_VERSION}) ENDIF(NOT DEBIAN_BUILD) SET_TARGET_PROPERTIES(rspamd-stat PROPERTIES LINKER_LANGUAGE C COMPILE_FLAGS "-DRSPAMD_LIB") TARGET_LINK_LIBRARIES(rspamd-stat rspamd-server) +TARGET_LINK_LIBRARIES(rspamd-stat sqlite3) IF(CMAKE_COMPILER_IS_GNUCC) SET_TARGET_PROPERTIES(rspamd-stat PROPERTIES COMPILE_FLAGS "-DRSPAMD_LIB -fno-strict-aliasing") diff --git a/src/libstat/learn_cache/learn_cache.h b/src/libstat/learn_cache/learn_cache.h index dd9240a27..bd01d1be1 100644 --- a/src/libstat/learn_cache/learn_cache.h +++ b/src/libstat/learn_cache/learn_cache.h @@ -34,10 +34,15 @@ typedef enum rspamd_learn_cache_result { RSPAMD_LEARN_INGORE } rspamd_learn_t; +struct rspamd_task; +struct rspamd_stat_ctx; +struct rspamd_config; + struct rspamd_stat_cache { const char *name; gpointer (*init)(struct rspamd_stat_ctx *ctx, struct rspamd_config *cfg); - rspamd_learn_t (*process)(GTree *input, gboolean is_spam, gpointer ctx); + rspamd_learn_t (*process)(struct rspamd_task *task, gboolean is_spam, + gpointer ctx); gpointer ctx; }; diff --git a/src/libstat/learn_cache/sqlite3_cache.c b/src/libstat/learn_cache/sqlite3_cache.c new file mode 100644 index 000000000..ef6f005b3 --- /dev/null +++ b/src/libstat/learn_cache/sqlite3_cache.c @@ -0,0 +1,203 @@ +/* Copyright (c) 2015, Vsevolod Stakhov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "learn_cache.h" +#include "main.h" +#include "stat_api.h" +#include "stat_internal.h" +#include "blake2.h" +#include "ucl.h" +#include "fstring.h" +#include "message.h" +#include + +const char *create_tables_sql = + "BEGIN;" + "CREATE TABLE IF NOT EXISTS learns(" + "id INTEGER PRIMARY KEY," + "flag INTEGER NOT NULL," + "digest TEXT NOT NULL);" + "CREATE UNIQUE INDEX IF NOT EXISTS d ON learns(digest);" + "COMMIT;"; + +#define SQLITE_CACHE_PATH RSPAMD_DBDIR "/learn_cache.sqlite" + +struct rspamd_stat_sqlite3_ctx { + sqlite3 *db; +}; + +gpointer +rspamd_stat_cache_sqlite3_init(struct rspamd_stat_ctx *ctx, + struct rspamd_config *cfg) +{ + struct rspamd_stat_sqlite3_ctx *new = NULL; + struct rspamd_classifier_config *clf; + const ucl_object_t *obj; + GList *cur; + sqlite3 *sqlite; + gboolean has_sqlite_cache = FALSE; + gint rc; + + cur = cfg->classifiers; + + while (cur) { + clf = cur->data; + + obj = ucl_object_find_key (clf->opts, "cache"); + + /* Sqlite3 cache is the default learn cache method */ + if (obj == NULL || g_ascii_strcasecmp (ucl_object_tostring (obj), + "sqlite3") == 0) { + has_sqlite_cache = TRUE; + break; + } + + cur = g_list_next (cur); + } + + if (has_sqlite_cache) { + if ((rc = sqlite3_open_v2 (SQLITE_CACHE_PATH, &sqlite, + SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_NOMUTEX, NULL)) + != SQLITE_OK) { + msg_err ("Cannot open sqlite db %s: %s", SQLITE_CACHE_PATH, + sqlite3_errstr (rc)); + + return NULL; + } + + if ((rc = sqlite3_exec (sqlite, create_tables_sql, NULL, NULL, NULL)) + != SQLITE_OK) { + sqlite3_close (sqlite); + msg_err ("Cannot initialize sqlite db %s: %s", SQLITE_CACHE_PATH, + sqlite3_errstr (rc)); + + return NULL; + } + + new = g_slice_alloc (sizeof (*new)); + new->db = sqlite; + } + + return new; +} + +static rspamd_learn_t +rspamd_stat_cache_sqlite3_check (const guchar *h, gsize len, gboolean is_spam, + struct rspamd_stat_sqlite3_ctx *ctx) +{ + static const gchar select_sql[] = "SELECT flag FROM learns WHERE digest=?1"; + static const gchar insert_sql[] = "INSERT INTO learns(digest, flag) VALUES " + "(?1, ?2);"; + static const gchar update_sql[] = "UPDATE learns SET flag=?1 WHERE digest=?2"; + sqlite3_stmt *st = NULL; + gint rc, ret = RSPAMD_LEARN_OK, flag; + + if ((rc = sqlite3_prepare_v2 (ctx->db, select_sql, + -1, &st, NULL)) != SQLITE_OK) { + msg_err ("Cannot prepare sql %s: %s", select_sql, sqlite3_errstr (rc)); + return RSPAMD_LEARN_OK; + } + + sqlite3_bind_text (st, 1, h, len, SQLITE_STATIC); + + rc = sqlite3_step (st); + + if (rc == SQLITE_ROW) { + /* We have some existing record in the table */ + flag = sqlite3_column_int (st, 0); + sqlite3_finalize (st); + + if ((flag && is_spam) || (!flag && !is_spam)) { + /* Already learned */ + ret = RSPAMD_LEARN_INGORE; + } + else { + /* Need to relearn */ + if ((rc = sqlite3_prepare_v2 (ctx->db, update_sql, + -1, &st, NULL)) != SQLITE_OK) { + msg_err ("Cannot prepare sql %s: %s", update_sql, + sqlite3_errstr (rc)); + } + else { + sqlite3_bind_int (st, 1, is_spam ? 1 : 0); + sqlite3_bind_text (st, 2, h, len, SQLITE_STATIC); + sqlite3_step (st); + sqlite3_finalize (st); + } + + return RSPAMD_LEARN_UNLEARN; + } + } + else { + /* Insert result new id */ + sqlite3_finalize (st); + if ((rc = sqlite3_prepare_v2 (ctx->db, insert_sql, + -1, &st, NULL)) != SQLITE_OK) { + msg_err ("Cannot prepare sql %s: %s", insert_sql, sqlite3_errstr (rc)); + } + else { + sqlite3_bind_text (st, 1, h, len, SQLITE_STATIC); + sqlite3_bind_int (st, 2, is_spam ? 1 : 0); + sqlite3_step (st); + sqlite3_finalize (st); + } + } + + return ret; +} + +rspamd_learn_t +rspamd_stat_cache_sqlite3_process(struct rspamd_task *task, + gboolean is_spam, gpointer c) +{ + struct rspamd_stat_sqlite3_ctx *ctx = (struct rspamd_stat_sqlite3_ctx *)c; + struct mime_text_part *part; + blake2b_state st; + rspamd_fstring_t *word; + guchar out[BLAKE2B_OUTBYTES]; + GList *cur; + guint i; + + if (ctx != NULL && ctx->db != NULL) { + blake2b_init (&st, sizeof (out)); + cur = task->text_parts; + + while (cur) { + part = (struct mime_text_part *)cur->data; + + for (i = 0; i < part->words->len; i ++) { + word = &g_array_index (part->words, rspamd_fstring_t, i); + blake2b_update (&st, word->begin, word->len); + } + + cur = g_list_next (cur); + } + + blake2b_final (&st, out, sizeof (out)); + + return rspamd_stat_cache_sqlite3_check (out, sizeof (out), is_spam, ctx); + } + + return RSPAMD_LEARN_OK; +}