From a80ca7dc671bc6afbc3769e14fa4db02f179a99c Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Tue, 19 Oct 2021 09:53:33 +0100
Subject: [PATCH] [Project] Cdb: continue statistics backend implementation

---
 src/libstat/CMakeLists.txt           |  11 +-
 src/libstat/backends/cdb_backend.cxx | 232 ++++++++++++++++++++++++++-
 2 files changed, 235 insertions(+), 8 deletions(-)

diff --git a/src/libstat/CMakeLists.txt b/src/libstat/CMakeLists.txt
index 83020a56d..19962239d 100644
--- a/src/libstat/CMakeLists.txt
+++ b/src/libstat/CMakeLists.txt
@@ -10,14 +10,11 @@ SET(CLASSIFIERSSRC	${CMAKE_CURRENT_SOURCE_DIR}/classifiers/bayes.c
 
 SET(BACKENDSSRC 		${CMAKE_CURRENT_SOURCE_DIR}/backends/mmaped_file.c
 						${CMAKE_CURRENT_SOURCE_DIR}/backends/sqlite3_backend.c
-						${CMAKE_CURRENT_SOURCE_DIR}/backends/cdb_backend.cxx)
-SET(CACHESSRC 		${CMAKE_CURRENT_SOURCE_DIR}/learn_cache/sqlite3_cache.c)
-
-SET(BACKENDSSRC ${BACKENDSSRC}
-		${CMAKE_CURRENT_SOURCE_DIR}/backends/redis_backend.c)
-SET(CACHESSRC ${CACHESSRC}
-		${CMAKE_CURRENT_SOURCE_DIR}/learn_cache/redis_cache.c)
+						${CMAKE_CURRENT_SOURCE_DIR}/backends/cdb_backend.cxx
+						${CMAKE_CURRENT_SOURCE_DIR}/backends/redis_backend.c)
+SET(CACHESSRC 			${CMAKE_CURRENT_SOURCE_DIR}/learn_cache/sqlite3_cache.c
+						${CMAKE_CURRENT_SOURCE_DIR}/learn_cache/redis_cache.c)
 
 
 SET(RSPAMD_STAT	${LIBSTATSRC}
 			${TOKENIZERSSRC}
diff --git a/src/libstat/backends/cdb_backend.cxx b/src/libstat/backends/cdb_backend.cxx
index a36876a8f..899ad949a 100644
--- a/src/libstat/backends/cdb_backend.cxx
+++ b/src/libstat/backends/cdb_backend.cxx
@@ -22,8 +22,224 @@
 #include "stat_internal.h"
 #include "contrib/cdb/cdb.h"
 
+#include <unistd.h>
+#include <utility>
+#include <memory>
+#include <string>
+#include <optional>
+#include "contrib/expected/expected.hpp"
+#include "fmt/core.h"
+
 namespace rspamd::stat::cdb {
 
+/**
+ * Read-only statistics backend that looks tokens up in a CDB file.
+ * Movable but not copyable: it owns the `struct cdb` handle.
+ */
+class ro_backend final {
+public:
+	explicit ro_backend(struct rspamd_statfile *_st, std::unique_ptr<struct cdb> &&_db)
+			: st(_st), db(std::move(_db)) {}
+	ro_backend() = delete;
+	ro_backend(const ro_backend &) = delete;
+	/* Moves carry the state cached by load_cdb() along with the handles */
+	ro_backend(ro_backend &&other) noexcept
+			: st(std::exchange(other.st, nullptr)),
+			  db(std::move(other.db)),
+			  loaded(std::exchange(other.loaded, false)),
+			  learns_spam(std::exchange(other.learns_spam, 0)),
+			  learns_ham(std::exchange(other.learns_ham, 0)) {}
+	ro_backend& operator=(ro_backend &&other) noexcept
+	{
+		std::swap(st, other.st);
+		std::swap(db, other.db);
+		std::swap(loaded, other.loaded);
+		std::swap(learns_spam, other.learns_spam);
+		std::swap(learns_ham, other.learns_ham);
+
+		return *this;
+	}
+	~ro_backend() {
+		if (db) {
+			// Might be worth to use unique ptr with a custom deleter
+			cdb_free(db.get());
+		}
+	}
+
+	/* Reads the learn counters; tokens are served only after it succeeds */
+	auto load_cdb() -> tl::expected<bool, std::string>;
+	/* Returns the spam or ham value stored for a token, if there is one */
+	auto process_token(const rspamd_token_t *tok) const -> std::optional<float>;
+private:
+	struct rspamd_statfile *st;
+	std::unique_ptr<struct cdb> db;
+	bool loaded = false;
+	std::uint64_t learns_spam = 0;
+	std::uint64_t learns_ham = 0;
+};
+
+/* Finds `key` by its raw bytes; yields the value only if it is one double */
+template<typename T>
+static inline auto
+cdb_get_key_as_double(struct cdb *cdb, T key) -> std::optional<double>
+{
+	auto pos = cdb_find(cdb, (void *)&key, sizeof(key));
+
+	if (pos > 0) {
+		auto vpos = cdb_datapos(cdb);
+		auto vlen = cdb_datalen(cdb);
+
+		if (vlen == sizeof(double)) {
+			double ret;
+
+			/* A failed read must not leak an uninitialised value */
+			if (cdb_read(cdb, (void *)&ret, vlen, vpos) == 0) {
+				return ret;
+			}
+		}
+	}
+
+	return std::nullopt;
+}
+
+/* Finds `key` by its raw bytes; yields the value only if it is two floats */
+template<typename T>
+static inline auto
+cdb_get_key_as_float_pair(struct cdb *cdb, T key) -> std::optional<std::pair<float, float>>
+{
+	auto pos = cdb_find(cdb, (void *)&key, sizeof(key));
+
+	if (pos > 0) {
+		auto vpos = cdb_datapos(cdb);
+		auto vlen = cdb_datalen(cdb);
+
+		if (vlen == sizeof(float) * 2) {
+			float values[2];
+
+			if (cdb_read(cdb, (void *)values, vlen, vpos) == 0) {
+				return std::make_pair(values[0], values[1]);
+			}
+		}
+	}
+
+	return std::nullopt;
+}
+
+
+auto
+ro_backend::load_cdb() -> tl::expected<bool, std::string>
+{
+	if (!db) {
+		return tl::make_unexpected("no database loaded");
+	}
+
+	/* Now get number of learns */
+	std::int64_t cdb_key;
+	/* Key names are NUL padded to the 8 bytes of cdb_key */
+	static const char learn_spam_key[8] = "lrnspam", learn_ham_key[8] = "lrnham";
+
+	auto check_key = [&](const char *key, std::uint64_t &target) -> tl::expected<bool, std::string> {
+		memcpy((void *)&cdb_key, key, sizeof(cdb_key));
+
+		auto maybe_value = cdb_get_key_as_double(db.get(), cdb_key);
+
+		if (!maybe_value) {
+			return tl::make_unexpected(fmt::format("missing {} key", key));
+		}
+
+		// Convert from double to int
+		target = (std::uint64_t)maybe_value.value();
+
+		return true;
+	};
+
+	auto res = check_key(learn_spam_key, learns_spam);
+
+	if (!res) {
+		return res;
+	}
+
+	res = check_key(learn_ham_key, learns_ham);
+
+	if (!res) {
+		return res;
+	}
+
+	loaded = true;
+
+	return true; // expected
+}
+
+auto
+ro_backend::process_token(const rspamd_token_t *tok) const -> std::optional<float>
+{
+	if (!loaded) {
+		return std::nullopt;
+	}
+
+	auto maybe_value = cdb_get_key_as_float_pair(db.get(), tok->data);
+
+	if (maybe_value) {
+		auto [spam_count, ham_count] = maybe_value.value();
+
+		if (st->stcf->is_spam) {
+			return spam_count;
+		}
+		else {
+			return ham_count;
+		}
+	}
+
+	return std::nullopt;
+}
+
+/* Opens the CDB named by the statfile options and loads its learn counters */
+auto
+open_cdb(struct rspamd_statfile *st) -> tl::expected<ro_backend, std::string>
+{
+	const auto *stf = st->stcf;
+
+	const auto *filename = ucl_object_lookup_any(stf->opts,
+			"filename", "path", "cdb", nullptr);
+
+	if (filename && ucl_object_type(filename) == UCL_STRING) {
+		const auto *path = ucl_object_tostring(filename);
+
+		auto fd = rspamd_file_xopen(path, O_RDONLY, 0, true);
+
+		if (fd == -1) {
+			return tl::make_unexpected(fmt::format("cannot open {}: {}",
+					path, strerror(errno)));
+		}
+
+		auto &&cdbs = std::make_unique<struct cdb>();
+
+		if (cdb_init(cdbs.get(), fd) == -1) {
+			/* Build the message first: close() may overwrite errno */
+			auto err = fmt::format("cannot init cdb in {}: {}",
+					path, strerror(errno));
+
+			/* Nothing else owns the descriptor on this path */
+			close(fd);
+
+			return tl::make_unexpected(std::move(err));
+		}
+
+		ro_backend bk{st, std::move(cdbs)};
+
+		auto res = bk.load_cdb();
+
+		if (!res) {
+			return tl::make_unexpected(res.error());
+		}
+
+		return bk;
+	}
+	else {
+		return tl::make_unexpected("missing/malformed filename attribute");
+	}
+}
+
 }
 
 /* C exports */
@@ -32,6 +248,18 @@ rspamd_cdb_init(struct rspamd_stat_ctx* ctx,
 						 struct rspamd_config* cfg,
 						 struct rspamd_statfile* st)
 {
+	auto maybe_backend = rspamd::stat::cdb::open_cdb(st);
+
+	if (maybe_backend) {
+		/* Move into a new pointer */
+		auto *result = new rspamd::stat::cdb::ro_backend(std::move(maybe_backend.value()));
+
+		return result;
+	}
+	else {
+		msg_err_config("cannot load cdb backend: %s", maybe_backend.error().c_str());
+	}
+
 	return nullptr;
 }
 gpointer
@@ -40,8 +268,10 @@ rspamd_cdb_runtime(struct rspamd_task* task,
 							gboolean learn,
 							gpointer ctx)
 {
-	return nullptr;
+	/* In CDB we don't have any dynamic stuff */
+	return ctx;
 }
+
 gboolean
 rspamd_cdb_process_tokens(struct rspamd_task* task,
 								   GPtrArray* tokens,
-- 
2.39.5