You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

cdb_backend.cxx 10KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491
  1. /*-
  2. * Copyright 2021 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. /*
  17. * CDB read only statistics backend
  18. */
  19. #include "config.h"
  20. #include "stat_internal.h"
  21. #include "contrib/cdb/cdb.h"
  22. #include <utility>
  23. #include <memory>
  24. #include <string>
  25. #include <optional>
  26. #include "contrib/expected/expected.hpp"
  27. #include "contrib/ankerl/unordered_dense.h"
  28. #include "fmt/core.h"
  29. namespace rspamd::stat::cdb {
  30. /*
  31. * Utility class to share cdb instances over statfiles instances, as each
  32. * cdb has tokens for both ham and spam classes
  33. */
  34. class cdb_shared_storage {
  35. public:
  36. using cdb_element_t = std::shared_ptr<struct cdb>;
  37. cdb_shared_storage() = default;
  38. auto get_cdb(const char *path) const -> std::optional<cdb_element_t>
  39. {
  40. auto found = elts.find(path);
  41. if (found != elts.end()) {
  42. if (!found->second.expired()) {
  43. return found->second.lock();
  44. }
  45. }
  46. return std::nullopt;
  47. }
  48. /* Create a new smart pointer over POD cdb structure */
  49. static auto new_cdb() -> cdb_element_t
  50. {
  51. auto ret = cdb_element_t(new struct cdb, cdb_deleter());
  52. memset(ret.get(), 0, sizeof(struct cdb));
  53. return ret;
  54. }
  55. /* Enclose cdb into storage */
  56. auto push_cdb(const char *path, cdb_element_t cdbp) -> cdb_element_t
  57. {
  58. auto found = elts.find(path);
  59. if (found != elts.end()) {
  60. if (found->second.expired()) {
  61. /* OK, move in lieu of the expired weak pointer */
  62. found->second = cdbp;
  63. return cdbp;
  64. }
  65. else {
  66. /*
  67. * Existing and not expired, return the existing one
  68. */
  69. return found->second.lock();
  70. }
  71. }
  72. else {
  73. /* Not existing, make a weak ptr and return the original */
  74. elts.emplace(path, std::weak_ptr<struct cdb>(cdbp));
  75. return cdbp;
  76. }
  77. }
  78. private:
  79. /*
  80. * We store weak pointers here to allow owning cdb statfiles to free
  81. * expensive cdb before this cache is terminated (e.g. on dynamic cdb reload)
  82. */
  83. ankerl::unordered_dense::map<std::string, std::weak_ptr<struct cdb>> elts;
  84. struct cdb_deleter {
  85. void operator()(struct cdb *c) const
  86. {
  87. cdb_free(c);
  88. delete c;
  89. }
  90. };
  91. };
  92. static cdb_shared_storage cdb_shared_storage;
  93. class ro_backend final {
  94. public:
  95. explicit ro_backend(struct rspamd_statfile *_st, cdb_shared_storage::cdb_element_t _db)
  96. : st(_st), db(std::move(_db))
  97. {
  98. }
  99. ro_backend() = delete;
  100. ro_backend(const ro_backend &) = delete;
  101. ro_backend(ro_backend &&other) noexcept
  102. {
  103. *this = std::move(other);
  104. }
  105. ro_backend &operator=(ro_backend &&other) noexcept
  106. {
  107. std::swap(st, other.st);
  108. std::swap(db, other.db);
  109. std::swap(loaded, other.loaded);
  110. std::swap(learns_spam, other.learns_spam);
  111. std::swap(learns_ham, other.learns_ham);
  112. return *this;
  113. }
  114. ~ro_backend()
  115. {
  116. }
  117. auto load_cdb() -> tl::expected<bool, std::string>;
  118. auto process_token(const rspamd_token_t *tok) const -> std::optional<float>;
  119. constexpr auto is_spam() const -> bool
  120. {
  121. return st->stcf->is_spam;
  122. }
  123. auto get_learns() const -> std::uint64_t
  124. {
  125. if (is_spam()) {
  126. return learns_spam;
  127. }
  128. else {
  129. return learns_ham;
  130. }
  131. }
  132. auto get_total_learns() const -> std::uint64_t
  133. {
  134. return learns_spam + learns_ham;
  135. }
  136. private:
  137. struct rspamd_statfile *st;
  138. cdb_shared_storage::cdb_element_t db;
  139. bool loaded = false;
  140. std::uint64_t learns_spam = 0;
  141. std::uint64_t learns_ham = 0;
  142. };
  143. template<typename T>
  144. static inline auto
  145. cdb_get_key_as_int64(struct cdb *cdb, T key) -> std::optional<std::int64_t>
  146. {
  147. auto pos = cdb_find(cdb, (void *) &key, sizeof(key));
  148. if (pos > 0) {
  149. auto vpos = cdb_datapos(cdb);
  150. auto vlen = cdb_datalen(cdb);
  151. if (vlen == sizeof(std::int64_t)) {
  152. std::int64_t ret;
  153. cdb_read(cdb, (void *) &ret, vlen, vpos);
  154. return ret;
  155. }
  156. }
  157. return std::nullopt;
  158. }
  159. template<typename T>
  160. static inline auto
  161. cdb_get_key_as_float_pair(struct cdb *cdb, T key) -> std::optional<std::pair<float, float>>
  162. {
  163. auto pos = cdb_find(cdb, (void *) &key, sizeof(key));
  164. if (pos > 0) {
  165. auto vpos = cdb_datapos(cdb);
  166. auto vlen = cdb_datalen(cdb);
  167. if (vlen == sizeof(float) * 2) {
  168. union {
  169. struct {
  170. float v1;
  171. float v2;
  172. } d;
  173. char c[sizeof(float) * 2];
  174. } u;
  175. cdb_read(cdb, (void *) u.c, vlen, vpos);
  176. return std::make_pair(u.d.v1, u.d.v2);
  177. }
  178. }
  179. return std::nullopt;
  180. }
  181. auto ro_backend::load_cdb() -> tl::expected<bool, std::string>
  182. {
  183. if (!db) {
  184. return tl::make_unexpected("no database loaded");
  185. }
  186. /* Now get number of learns */
  187. std::int64_t cdb_key;
  188. static const char learn_spam_key[9] = "_lrnspam", learn_ham_key[9] = "_lrnham_";
  189. auto check_key = [&](const char *key, std::uint64_t &target) -> tl::expected<bool, std::string> {
  190. memcpy((void *) &cdb_key, key, sizeof(cdb_key));
  191. auto maybe_value = cdb_get_key_as_int64(db.get(), cdb_key);
  192. if (!maybe_value) {
  193. return tl::make_unexpected(fmt::format("missing {} key", key));
  194. }
  195. target = (std::uint64_t) maybe_value.value();
  196. return true;
  197. };
  198. auto res = check_key(learn_spam_key, learns_spam);
  199. if (!res) {
  200. return res;
  201. }
  202. res = check_key(learn_ham_key, learns_ham);
  203. if (!res) {
  204. return res;
  205. }
  206. loaded = true;
  207. return true;// expected
  208. }
  209. auto ro_backend::process_token(const rspamd_token_t *tok) const -> std::optional<float>
  210. {
  211. if (!loaded) {
  212. return std::nullopt;
  213. }
  214. auto maybe_value = cdb_get_key_as_float_pair(db.get(), tok->data);
  215. if (maybe_value) {
  216. auto [spam_count, ham_count] = maybe_value.value();
  217. if (is_spam()) {
  218. return spam_count;
  219. }
  220. else {
  221. return ham_count;
  222. }
  223. }
  224. return std::nullopt;
  225. }
  226. auto open_cdb(struct rspamd_statfile *st) -> tl::expected<ro_backend, std::string>
  227. {
  228. const char *path = nullptr;
  229. const auto *stf = st->stcf;
  230. auto get_filename = [](const ucl_object_t *obj) -> const char * {
  231. const auto *filename = ucl_object_lookup_any(obj,
  232. "filename", "path", "cdb", nullptr);
  233. if (filename && ucl_object_type(filename) == UCL_STRING) {
  234. return ucl_object_tostring(filename);
  235. }
  236. return nullptr;
  237. };
  238. /* First search in backend configuration */
  239. const auto *obj = ucl_object_lookup(st->classifier->cfg->opts, "backend");
  240. if (obj != NULL && ucl_object_type(obj) == UCL_OBJECT) {
  241. path = get_filename(obj);
  242. }
  243. /* Now try statfiles config */
  244. if (!path && stf->opts) {
  245. path = get_filename(stf->opts);
  246. }
  247. /* Now try classifier config */
  248. if (!path && st->classifier->cfg->opts) {
  249. path = get_filename(st->classifier->cfg->opts);
  250. }
  251. if (!path) {
  252. return tl::make_unexpected("missing/malformed filename attribute");
  253. }
  254. auto cached_cdb_maybe = cdb_shared_storage.get_cdb(path);
  255. cdb_shared_storage::cdb_element_t cdbp;
  256. if (!cached_cdb_maybe) {
  257. auto fd = rspamd_file_xopen(path, O_RDONLY, 0, true);
  258. if (fd == -1) {
  259. return tl::make_unexpected(fmt::format("cannot open {}: {}",
  260. path, strerror(errno)));
  261. }
  262. cdbp = cdb_shared_storage::new_cdb();
  263. if (cdb_init(cdbp.get(), fd) == -1) {
  264. close(fd);
  265. return tl::make_unexpected(fmt::format("cannot init cdb in {}: {}",
  266. path, strerror(errno)));
  267. }
  268. cdbp = cdb_shared_storage.push_cdb(path, cdbp);
  269. close(fd);
  270. }
  271. else {
  272. cdbp = cached_cdb_maybe.value();
  273. }
  274. if (!cdbp) {
  275. return tl::make_unexpected(fmt::format("cannot init cdb in {}: internal error",
  276. path));
  277. }
  278. ro_backend bk{st, std::move(cdbp)};
  279. auto res = bk.load_cdb();
  280. if (!res) {
  281. return tl::make_unexpected(res.error());
  282. }
  283. return bk;
  284. }
  285. }// namespace rspamd::stat::cdb
  286. #define CDB_FROM_RAW(p) (reinterpret_cast<rspamd::stat::cdb::ro_backend *>(p))
  287. /* C exports */
  288. gpointer
  289. rspamd_cdb_init(struct rspamd_stat_ctx *ctx,
  290. struct rspamd_config *cfg,
  291. struct rspamd_statfile *st)
  292. {
  293. auto maybe_backend = rspamd::stat::cdb::open_cdb(st);
  294. if (maybe_backend) {
  295. /* Move into a new pointer */
  296. auto *result = new rspamd::stat::cdb::ro_backend(std::move(maybe_backend.value()));
  297. return result;
  298. }
  299. else {
  300. msg_err_config("cannot load cdb backend: %s", maybe_backend.error().c_str());
  301. }
  302. return nullptr;
  303. }
  304. gpointer
  305. rspamd_cdb_runtime(struct rspamd_task *task,
  306. struct rspamd_statfile_config *stcf,
  307. gboolean learn,
  308. gpointer ctx,
  309. int _id)
  310. {
  311. /* In CDB we don't have any dynamic stuff */
  312. return ctx;
  313. }
  314. gboolean
  315. rspamd_cdb_process_tokens(struct rspamd_task *task,
  316. GPtrArray *tokens,
  317. int id,
  318. gpointer runtime)
  319. {
  320. auto *cdbp = CDB_FROM_RAW(runtime);
  321. bool seen_values = false;
  322. for (auto i = 0u; i < tokens->len; i++) {
  323. rspamd_token_t *tok;
  324. tok = reinterpret_cast<rspamd_token_t *>(g_ptr_array_index(tokens, i));
  325. auto res = cdbp->process_token(tok);
  326. if (res) {
  327. tok->values[id] = res.value();
  328. seen_values = true;
  329. }
  330. else {
  331. tok->values[id] = 0;
  332. }
  333. }
  334. if (seen_values) {
  335. if (cdbp->is_spam()) {
  336. task->flags |= RSPAMD_TASK_FLAG_HAS_SPAM_TOKENS;
  337. }
  338. else {
  339. task->flags |= RSPAMD_TASK_FLAG_HAS_HAM_TOKENS;
  340. }
  341. }
  342. return true;
  343. }
  344. gboolean
  345. rspamd_cdb_finalize_process(struct rspamd_task *task,
  346. gpointer runtime,
  347. gpointer ctx)
  348. {
  349. return true;
  350. }
  351. gboolean
  352. rspamd_cdb_learn_tokens(struct rspamd_task *task,
  353. GPtrArray *tokens,
  354. int id,
  355. gpointer ctx)
  356. {
  357. return false;
  358. }
  359. gboolean
  360. rspamd_cdb_finalize_learn(struct rspamd_task *task,
  361. gpointer runtime,
  362. gpointer ctx,
  363. GError **err)
  364. {
  365. return false;
  366. }
  367. gulong rspamd_cdb_total_learns(struct rspamd_task *task,
  368. gpointer runtime,
  369. gpointer ctx)
  370. {
  371. auto *cdbp = CDB_FROM_RAW(ctx);
  372. return cdbp->get_total_learns();
  373. }
  374. gulong
  375. rspamd_cdb_inc_learns(struct rspamd_task *task,
  376. gpointer runtime,
  377. gpointer ctx)
  378. {
  379. return (gulong) -1;
  380. }
  381. gulong
  382. rspamd_cdb_dec_learns(struct rspamd_task *task,
  383. gpointer runtime,
  384. gpointer ctx)
  385. {
  386. return (gulong) -1;
  387. }
  388. gulong
  389. rspamd_cdb_learns(struct rspamd_task *task,
  390. gpointer runtime,
  391. gpointer ctx)
  392. {
  393. auto *cdbp = CDB_FROM_RAW(ctx);
  394. return cdbp->get_learns();
  395. }
  396. ucl_object_t *
  397. rspamd_cdb_get_stat(gpointer runtime, gpointer ctx)
  398. {
  399. return nullptr;
  400. }
  401. gpointer
  402. rspamd_cdb_load_tokenizer_config(gpointer runtime, gsize *len)
  403. {
  404. return nullptr;
  405. }
  406. void rspamd_cdb_close(gpointer ctx)
  407. {
  408. auto *cdbp = CDB_FROM_RAW(ctx);
  409. delete cdbp;
  410. }