Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

stat_config.c 15KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "stat_api.h"
  18. #include "rspamd.h"
  19. #include "cfg_rcl.h"
  20. #include "stat_internal.h"
  21. #include "lua/lua_common.h"
  /*
   * Process-wide statistics context: allocated by rspamd_stat_init(), handed
   * out by rspamd_stat_get_ctx() and reset to NULL by rspamd_stat_close().
   * Static storage is zero-initialised, so it starts out as NULL.
   */
  static struct rspamd_stat_ctx *stat_ctx;
  23. static struct rspamd_stat_classifier lua_classifier = {
  24. .name = "lua",
  25. .init_func = lua_classifier_init,
  26. .classify_func = lua_classifier_classify,
  27. .learn_spam_func = lua_classifier_learn_spam,
  28. .fin_func = NULL,
  29. };
  30. static struct rspamd_stat_classifier stat_classifiers[] = {
  31. {
  32. .name = "bayes",
  33. .init_func = bayes_init,
  34. .classify_func = bayes_classify,
  35. .learn_spam_func = bayes_learn_spam,
  36. .fin_func = bayes_fin,
  37. }
  38. };
  39. static struct rspamd_stat_tokenizer stat_tokenizers[] = {
  40. {
  41. .name = "osb-text",
  42. .get_config = rspamd_tokenizer_osb_get_config,
  43. .tokenize_func = rspamd_tokenizer_osb,
  44. },
  45. {
  46. .name = "osb",
  47. .get_config = rspamd_tokenizer_osb_get_config,
  48. .tokenize_func = rspamd_tokenizer_osb,
  49. },
  50. };
  /*
   * Expands to an initializer for a writable `struct rspamd_stat_backend`.
   *   nam  - backend name, stringified into the .name member
   *   eltn - infix of the implementation symbols: every callback is bound to
   *          the function called rspamd_<eltn>_<member>
   */
  #define RSPAMD_STAT_BACKEND_ELT(nam, eltn) { \
      .name = #nam, \
      .read_only = false, \
      .init = rspamd_##eltn##_init, \
      .close = rspamd_##eltn##_close, \
      .runtime = rspamd_##eltn##_runtime, \
      .load_tokenizer_config = rspamd_##eltn##_load_tokenizer_config, \
      .get_stat = rspamd_##eltn##_get_stat, \
      .process_tokens = rspamd_##eltn##_process_tokens, \
      .finalize_process = rspamd_##eltn##_finalize_process, \
      .learn_tokens = rspamd_##eltn##_learn_tokens, \
      .finalize_learn = rspamd_##eltn##_finalize_learn, \
      .total_learns = rspamd_##eltn##_total_learns, \
      .inc_learns = rspamd_##eltn##_inc_learns, \
      .dec_learns = rspamd_##eltn##_dec_learns \
  }
  /*
   * Same as RSPAMD_STAT_BACKEND_ELT, but for a read-only backend: .read_only
   * is true and every callback that would modify the storage (learn_tokens,
   * finalize_learn, inc_learns, dec_learns) is NULL.
   */
  #define RSPAMD_STAT_BACKEND_ELT_READONLY(nam, eltn) { \
      .name = #nam, \
      .read_only = true, \
      .init = rspamd_##eltn##_init, \
      .close = rspamd_##eltn##_close, \
      .runtime = rspamd_##eltn##_runtime, \
      .load_tokenizer_config = rspamd_##eltn##_load_tokenizer_config, \
      .get_stat = rspamd_##eltn##_get_stat, \
      .process_tokens = rspamd_##eltn##_process_tokens, \
      .finalize_process = rspamd_##eltn##_finalize_process, \
      .total_learns = rspamd_##eltn##_total_learns, \
      .learn_tokens = NULL, \
      .finalize_learn = NULL, \
      .inc_learns = NULL, \
      .dec_learns = NULL \
  }
  83. static struct rspamd_stat_backend stat_backends[] = {
  84. RSPAMD_STAT_BACKEND_ELT(mmap, mmaped_file),
  85. RSPAMD_STAT_BACKEND_ELT(sqlite3, sqlite3),
  86. RSPAMD_STAT_BACKEND_ELT_READONLY(cdb, cdb),
  87. RSPAMD_STAT_BACKEND_ELT(redis, redis)
  88. };
  /*
   * Expands to an initializer for a `struct rspamd_stat_cache`.
   *   nam  - cache name, stringified into the .name member
   *   eltn - infix of the implementation symbols: every callback is bound to
   *          the function called rspamd_stat_cache_<eltn>_<member>
   */
  #define RSPAMD_STAT_CACHE_ELT(nam, eltn) { \
      .name = #nam, \
      .init = rspamd_stat_cache_##eltn##_init, \
      .close = rspamd_stat_cache_##eltn##_close, \
      .runtime = rspamd_stat_cache_##eltn##_runtime, \
      .check = rspamd_stat_cache_##eltn##_check, \
      .learn = rspamd_stat_cache_##eltn##_learn \
  }
  97. static struct rspamd_stat_cache stat_caches[] = {
  98. RSPAMD_STAT_CACHE_ELT(sqlite3, sqlite3),
  99. RSPAMD_STAT_CACHE_ELT(redis, redis),
  100. };
  101. void
  102. rspamd_stat_init (struct rspamd_config *cfg, struct ev_loop *ev_base)
  103. {
  104. GList *cur, *curst;
  105. struct rspamd_classifier_config *clf;
  106. struct rspamd_statfile_config *stf;
  107. struct rspamd_stat_backend *bk;
  108. struct rspamd_statfile *st;
  109. struct rspamd_classifier *cl;
  110. const ucl_object_t *cache_obj = NULL, *cache_name_obj;
  111. const gchar *cache_name = NULL;
  112. lua_State *L = cfg->lua_state;
  113. guint lua_classifiers_cnt = 0, i;
  114. gboolean skip_cache = FALSE;
  115. if (stat_ctx == NULL) {
  116. stat_ctx = g_malloc0 (sizeof (*stat_ctx));
  117. }
  118. lua_getglobal (L, "rspamd_classifiers");
  119. if (lua_type (L, -1) == LUA_TTABLE) {
  120. lua_pushnil (L);
  121. while (lua_next (L, -2) != 0) {
  122. lua_classifiers_cnt ++;
  123. lua_pop (L, 1);
  124. }
  125. }
  126. lua_pop (L, 1);
  127. stat_ctx->classifiers_count = G_N_ELEMENTS (stat_classifiers) +
  128. lua_classifiers_cnt;
  129. stat_ctx->classifiers_subrs = g_new0 (struct rspamd_stat_classifier,
  130. stat_ctx->classifiers_count);
  131. for (i = 0; i < G_N_ELEMENTS (stat_classifiers); i ++) {
  132. memcpy (&stat_ctx->classifiers_subrs[i], &stat_classifiers[i],
  133. sizeof (struct rspamd_stat_classifier));
  134. }
  135. lua_getglobal (L, "rspamd_classifiers");
  136. if (lua_type (L, -1) == LUA_TTABLE) {
  137. lua_pushnil (L);
  138. while (lua_next (L, -2) != 0) {
  139. lua_pushvalue (L, -2);
  140. memcpy (&stat_ctx->classifiers_subrs[i], &lua_classifier,
  141. sizeof (struct rspamd_stat_classifier));
  142. stat_ctx->classifiers_subrs[i].name = g_strdup (lua_tostring (L, -1));
  143. i ++;
  144. lua_pop (L, 2);
  145. }
  146. }
  147. lua_pop (L, 1);
  148. stat_ctx->backends_subrs = stat_backends;
  149. stat_ctx->backends_count = G_N_ELEMENTS (stat_backends);
  150. stat_ctx->tokenizers_subrs = stat_tokenizers;
  151. stat_ctx->tokenizers_count = G_N_ELEMENTS (stat_tokenizers);
  152. stat_ctx->caches_subrs = stat_caches;
  153. stat_ctx->caches_count = G_N_ELEMENTS (stat_caches);
  154. stat_ctx->cfg = cfg;
  155. stat_ctx->statfiles = g_ptr_array_new ();
  156. stat_ctx->classifiers = g_ptr_array_new ();
  157. stat_ctx->async_elts = g_queue_new ();
  158. stat_ctx->event_loop = ev_base;
  159. stat_ctx->lua_stat_tokens_ref = -1;
  160. /* Interact with lua_stat */
  161. if (luaL_dostring (L, "return require \"lua_stat\"") != 0) {
  162. msg_err_config ("cannot require lua_stat: %s",
  163. lua_tostring (L, -1));
  164. }
  165. else {
  166. #if LUA_VERSION_NUM >= 504
  167. lua_settop(L, -2);
  168. #endif
  169. if (lua_type (L, -1) != LUA_TTABLE) {
  170. msg_err_config ("lua stat must return "
  171. "table and not %s",
  172. lua_typename (L, lua_type (L, -1)));
  173. }
  174. else {
  175. lua_pushstring (L, "gen_stat_tokens");
  176. lua_gettable (L, -2);
  177. if (lua_type (L, -1) != LUA_TFUNCTION) {
  178. msg_err_config ("gen_stat_tokens must return "
  179. "function and not %s",
  180. lua_typename (L, lua_type (L, -1)));
  181. }
  182. else {
  183. /* Call this function to obtain closure */
  184. gint err_idx, ret;
  185. struct rspamd_config **pcfg;
  186. lua_pushcfunction (L, &rspamd_lua_traceback);
  187. err_idx = lua_gettop (L);
  188. lua_pushvalue (L, err_idx - 1);
  189. pcfg = lua_newuserdata (L, sizeof (*pcfg));
  190. *pcfg = cfg;
  191. rspamd_lua_setclass (L, "rspamd{config}", -1);
  192. if ((ret = lua_pcall (L, 1, 1, err_idx)) != 0) {
  193. msg_err_config ("call to gen_stat_tokens lua "
  194. "script failed (%d): %s", ret,
  195. lua_tostring (L, -1));
  196. }
  197. else {
  198. if (lua_type (L, -1) != LUA_TFUNCTION) {
  199. msg_err_config ("gen_stat_tokens invocation must return "
  200. "function and not %s",
  201. lua_typename (L, lua_type (L, -1)));
  202. }
  203. else {
  204. stat_ctx->lua_stat_tokens_ref = luaL_ref (L, LUA_REGISTRYINDEX);
  205. }
  206. }
  207. }
  208. }
  209. }
  210. /* Cleanup mess */
  211. lua_settop (L, 0);
  212. /* Create statfiles from the classifiers */
  213. cur = cfg->classifiers;
  214. while (cur) {
  215. bk = NULL;
  216. clf = cur->data;
  217. cl = g_malloc0 (sizeof (*cl));
  218. cl->cfg = clf;
  219. cl->ctx = stat_ctx;
  220. cl->statfiles_ids = g_array_new (FALSE, FALSE, sizeof (gint));
  221. cl->subrs = rspamd_stat_get_classifier (clf->classifier);
  222. if (cl->subrs == NULL) {
  223. g_free (cl);
  224. msg_err_config ("cannot init classifier type %s", clf->name);
  225. cur = g_list_next (cur);
  226. continue;
  227. }
  228. if (!cl->subrs->init_func (cfg, ev_base, cl)) {
  229. g_free (cl);
  230. msg_err_config ("cannot init classifier type %s", clf->name);
  231. cur = g_list_next (cur);
  232. continue;
  233. }
  234. if (!(clf->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND)) {
  235. bk = rspamd_stat_get_backend (clf->backend);
  236. if (bk == NULL) {
  237. msg_err_config ("cannot get backend of type %s, so disable classifier"
  238. " %s completely", clf->backend, clf->name);
  239. cur = g_list_next (cur);
  240. continue;
  241. }
  242. }
  243. else {
  244. /* This actually is not implemented so it should never happen */
  245. g_free (cl);
  246. cur = g_list_next (cur);
  247. continue;
  248. }
  249. /* XXX:
  250. * Here we get the first classifier tokenizer config as the only one
  251. * We NO LONGER support multiple tokenizers per rspamd instance
  252. */
  253. if (stat_ctx->tkcf == NULL) {
  254. stat_ctx->tokenizer = rspamd_stat_get_tokenizer (clf->tokenizer->name);
  255. g_assert (stat_ctx->tokenizer != NULL);
  256. stat_ctx->tkcf = stat_ctx->tokenizer->get_config (cfg->cfg_pool,
  257. clf->tokenizer, NULL);
  258. }
  259. /* Init classifier cache */
  260. cache_name = NULL;
  261. if (!bk->read_only) {
  262. if (clf->opts) {
  263. cache_obj = ucl_object_lookup(clf->opts, "cache");
  264. cache_name_obj = NULL;
  265. if (cache_obj && ucl_object_type(cache_obj) == UCL_NULL) {
  266. skip_cache = TRUE;
  267. }
  268. else {
  269. if (cache_obj) {
  270. cache_name_obj = ucl_object_lookup_any(cache_obj,
  271. "name", "type", NULL);
  272. }
  273. if (cache_name_obj) {
  274. cache_name = ucl_object_tostring(cache_name_obj);
  275. }
  276. }
  277. }
  278. }
  279. else {
  280. skip_cache = true;
  281. }
  282. if (cache_name == NULL && !skip_cache) {
  283. /* We assume that learn cache is the same as backend */
  284. cache_name = clf->backend;
  285. }
  286. curst = clf->statfiles;
  287. while (curst) {
  288. stf = curst->data;
  289. st = g_malloc0 (sizeof (*st));
  290. st->classifier = cl;
  291. st->stcf = stf;
  292. if (!(cl->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND)) {
  293. st->backend = bk;
  294. st->bkcf = bk->init (stat_ctx, cfg, st);
  295. msg_info_config ("added backend %s for symbol %s",
  296. bk->name, stf->symbol);
  297. }
  298. else {
  299. msg_debug_config ("added backend-less statfile for symbol %s",
  300. stf->symbol);
  301. }
  302. /* XXX: bad hack to pass statfiles configuration to cache */
  303. if (cl->cache == NULL && !skip_cache) {
  304. cl->cache = rspamd_stat_get_cache (cache_name);
  305. g_assert (cl->cache != NULL);
  306. cl->cachecf = cl->cache->init (stat_ctx, cfg, st, cache_obj);
  307. if (cl->cachecf == NULL) {
  308. msg_err_config ("error adding cache %s for symbol %s",
  309. cl->cache->name, stf->symbol);
  310. cl->cache = NULL;
  311. }
  312. else {
  313. msg_debug_config ("added cache %s for symbol %s",
  314. cl->cache->name, stf->symbol);
  315. }
  316. }
  317. if (st->bkcf == NULL &&
  318. !(cl->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND)) {
  319. msg_err_config ("cannot init backend %s for statfile %s",
  320. clf->backend, stf->symbol);
  321. g_free (st);
  322. }
  323. else {
  324. st->id = stat_ctx->statfiles->len;
  325. g_ptr_array_add (stat_ctx->statfiles, st);
  326. g_array_append_val (cl->statfiles_ids, st->id);
  327. }
  328. curst = curst->next;
  329. }
  330. g_ptr_array_add (stat_ctx->classifiers, cl);
  331. cur = cur->next;
  332. }
  333. }
  334. void
  335. rspamd_stat_close (void)
  336. {
  337. struct rspamd_classifier *cl;
  338. struct rspamd_statfile *st;
  339. struct rspamd_stat_ctx *st_ctx;
  340. struct rspamd_stat_async_elt *aelt;
  341. GList *cur;
  342. guint i, j;
  343. gint id;
  344. st_ctx = rspamd_stat_get_ctx ();
  345. g_assert (st_ctx != NULL);
  346. for (i = 0; i < st_ctx->classifiers->len; i ++) {
  347. cl = g_ptr_array_index (st_ctx->classifiers, i);
  348. for (j = 0; j < cl->statfiles_ids->len; j ++) {
  349. id = g_array_index (cl->statfiles_ids, gint, j);
  350. st = g_ptr_array_index (st_ctx->statfiles, id);
  351. if (!(st->classifier->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND)) {
  352. st->backend->close (st->bkcf);
  353. }
  354. g_free (st);
  355. }
  356. if (cl->cache && cl->cachecf) {
  357. cl->cache->close (cl->cachecf);
  358. }
  359. g_array_free (cl->statfiles_ids, TRUE);
  360. if (cl->subrs->fin_func) {
  361. cl->subrs->fin_func (cl);
  362. }
  363. g_free (cl);
  364. }
  365. cur = st_ctx->async_elts->head;
  366. while (cur) {
  367. aelt = cur->data;
  368. REF_RELEASE (aelt);
  369. cur = g_list_next (cur);
  370. }
  371. g_queue_free (stat_ctx->async_elts);
  372. g_ptr_array_free (st_ctx->statfiles, TRUE);
  373. g_ptr_array_free (st_ctx->classifiers, TRUE);
  374. if (st_ctx->lua_stat_tokens_ref != -1) {
  375. luaL_unref (st_ctx->cfg->lua_state, LUA_REGISTRYINDEX,
  376. st_ctx->lua_stat_tokens_ref);
  377. }
  378. g_free (st_ctx);
  379. /* Set global var to NULL */
  380. stat_ctx = NULL;
  381. }
  382. struct rspamd_stat_ctx *
  383. rspamd_stat_get_ctx (void)
  384. {
  385. return stat_ctx;
  386. }
  387. struct rspamd_stat_classifier *
  388. rspamd_stat_get_classifier (const gchar *name)
  389. {
  390. guint i;
  391. if (name == NULL || name[0] == '\0') {
  392. name = RSPAMD_DEFAULT_CLASSIFIER;
  393. }
  394. for (i = 0; i < stat_ctx->classifiers_count; i ++) {
  395. if (strcmp (name, stat_ctx->classifiers_subrs[i].name) == 0) {
  396. return &stat_ctx->classifiers_subrs[i];
  397. }
  398. }
  399. msg_err ("cannot find classifier named %s", name);
  400. return NULL;
  401. }
  402. struct rspamd_stat_backend *
  403. rspamd_stat_get_backend (const gchar *name)
  404. {
  405. guint i;
  406. if (name == NULL || name[0] == '\0') {
  407. name = RSPAMD_DEFAULT_BACKEND;
  408. }
  409. for (i = 0; i < stat_ctx->backends_count; i ++) {
  410. if (strcmp (name, stat_ctx->backends_subrs[i].name) == 0) {
  411. return &stat_ctx->backends_subrs[i];
  412. }
  413. }
  414. msg_err ("cannot find backend named %s", name);
  415. return NULL;
  416. }
  417. struct rspamd_stat_tokenizer *
  418. rspamd_stat_get_tokenizer (const gchar *name)
  419. {
  420. guint i;
  421. if (name == NULL || name[0] == '\0') {
  422. name = RSPAMD_DEFAULT_TOKENIZER;
  423. }
  424. for (i = 0; i < stat_ctx->tokenizers_count; i ++) {
  425. if (strcmp (name, stat_ctx->tokenizers_subrs[i].name) == 0) {
  426. return &stat_ctx->tokenizers_subrs[i];
  427. }
  428. }
  429. msg_err ("cannot find tokenizer named %s", name);
  430. return NULL;
  431. }
  432. struct rspamd_stat_cache *
  433. rspamd_stat_get_cache (const gchar *name)
  434. {
  435. guint i;
  436. if (name == NULL || name[0] == '\0') {
  437. name = RSPAMD_DEFAULT_CACHE;
  438. }
  439. for (i = 0; i < stat_ctx->caches_count; i++) {
  440. if (strcmp (name, stat_ctx->caches_subrs[i].name) == 0) {
  441. return &stat_ctx->caches_subrs[i];
  442. }
  443. }
  444. msg_err ("cannot find cache named %s", name);
  445. return NULL;
  446. }
  447. static void
  448. rspamd_async_elt_dtor (struct rspamd_stat_async_elt *elt)
  449. {
  450. if (elt->cleanup) {
  451. elt->cleanup (elt, elt->ud);
  452. }
  453. ev_timer_stop (elt->event_loop, &elt->timer_ev);
  454. g_free (elt);
  455. }
  456. static void
  457. rspamd_async_elt_on_timer (EV_P_ ev_timer *w, int revents)
  458. {
  459. struct rspamd_stat_async_elt *elt = (struct rspamd_stat_async_elt *)w->data;
  460. gdouble jittered_time;
  461. if (elt->enabled) {
  462. elt->handler (elt, elt->ud);
  463. }
  464. jittered_time = rspamd_time_jitter (elt->timeout, 0);
  465. elt->timer_ev.repeat = jittered_time;
  466. ev_timer_again (EV_A_ w);
  467. }
  468. struct rspamd_stat_async_elt*
  469. rspamd_stat_ctx_register_async (rspamd_stat_async_handler handler,
  470. rspamd_stat_async_cleanup cleanup,
  471. gpointer d,
  472. gdouble timeout)
  473. {
  474. struct rspamd_stat_async_elt *elt;
  475. struct rspamd_stat_ctx *st_ctx;
  476. st_ctx = rspamd_stat_get_ctx ();
  477. g_assert (st_ctx != NULL);
  478. elt = g_malloc0 (sizeof (*elt));
  479. elt->handler = handler;
  480. elt->cleanup = cleanup;
  481. elt->ud = d;
  482. elt->timeout = timeout;
  483. elt->event_loop = st_ctx->event_loop;
  484. REF_INIT_RETAIN (elt, rspamd_async_elt_dtor);
  485. /* Enabled by default */
  486. if (st_ctx->event_loop) {
  487. elt->enabled = TRUE;
  488. /*
  489. * First we set timeval to zero as we want cb to be executed as
  490. * fast as possible
  491. */
  492. elt->timer_ev.data = elt;
  493. ev_timer_init (&elt->timer_ev, rspamd_async_elt_on_timer,
  494. 0.1, 0.0);
  495. ev_timer_start (st_ctx->event_loop, &elt->timer_ev);
  496. }
  497. else {
  498. elt->enabled = FALSE;
  499. }
  500. g_queue_push_tail (st_ctx->async_elts, elt);
  501. return elt;
  502. }