You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

stat_process.c 33KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "stat_api.h"
  18. #include "rspamd.h"
  19. #include "stat_internal.h"
  20. #include "libmime/message.h"
  21. #include "libmime/images.h"
  22. #include "libserver/html.h"
  23. #include "lua/lua_common.h"
  24. #include "libserver/mempool_vars_internal.h"
  25. #include "utlist.h"
  26. #include <math.h>
  27. #define RSPAMD_CLASSIFY_OP 0
  28. #define RSPAMD_LEARN_OP 1
  29. #define RSPAMD_UNLEARN_OP 2
  30. static const gdouble similarity_treshold = 80.0;
  31. static void
  32. rspamd_stat_tokenize_header (struct rspamd_task *task,
  33. const gchar *name, const gchar *prefix, GArray *ar)
  34. {
  35. struct rspamd_mime_header *cur;
  36. GPtrArray *hdrs;
  37. guint i;
  38. rspamd_stat_token_t str;
  39. hdrs = g_hash_table_lookup (task->raw_headers, name);
  40. str.flags = RSPAMD_STAT_TOKEN_FLAG_META;
  41. if (hdrs != NULL) {
  42. PTR_ARRAY_FOREACH (hdrs, i, cur) {
  43. if (cur->name != NULL) {
  44. str.begin = cur->name;
  45. str.len = strlen (cur->name);
  46. g_array_append_val (ar, str);
  47. }
  48. if (cur->decoded != NULL) {
  49. str.begin = cur->decoded;
  50. str.len = strlen (cur->decoded);
  51. g_array_append_val (ar, str);
  52. }
  53. else if (cur->value != NULL) {
  54. str.begin = cur->value;
  55. str.len = strlen (cur->value);
  56. g_array_append_val (ar, str);
  57. }
  58. }
  59. msg_debug_task ("added stat tokens for header '%s'", name);
  60. }
  61. }
  62. static void
  63. rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
  64. struct rspamd_task *task)
  65. {
  66. struct rspamd_image *img;
  67. struct rspamd_mime_part *part;
  68. struct rspamd_mime_text_part *tp;
  69. GList *cur;
  70. GArray *ar;
  71. rspamd_stat_token_t elt;
  72. guint i;
  73. gchar tmpbuf[128];
  74. lua_State *L = task->cfg->lua_state;
  75. const gchar *headers_hash;
  76. struct rspamd_mime_header *hdr;
  77. ar = g_array_sized_new (FALSE, FALSE, sizeof (elt), 16);
  78. elt.flags = RSPAMD_STAT_TOKEN_FLAG_META;
  79. /* Insert images */
  80. for (i = 0; i < task->parts->len; i ++) {
  81. part = g_ptr_array_index (task->parts, i);
  82. if ((part->flags & RSPAMD_MIME_PART_IMAGE) && part->specific.img) {
  83. img = part->specific.img;
  84. /* If an image has a linked HTML part, then we push its details to the stat */
  85. if (img->html_image) {
  86. elt.begin = (gchar *)"image";
  87. elt.len = 5;
  88. g_array_append_val (ar, elt);
  89. elt.begin = (gchar *)&img->html_image->height;
  90. elt.len = sizeof (img->html_image->height);
  91. g_array_append_val (ar, elt);
  92. elt.begin = (gchar *)&img->html_image->width;
  93. elt.len = sizeof (img->html_image->width);
  94. g_array_append_val (ar, elt);
  95. elt.begin = (gchar *)&img->type;
  96. elt.len = sizeof (img->type);
  97. g_array_append_val (ar, elt);
  98. if (img->filename) {
  99. elt.begin = (gchar *)img->filename;
  100. elt.len = strlen (elt.begin);
  101. g_array_append_val (ar, elt);
  102. }
  103. msg_debug_task ("added stat tokens for image '%s'", img->html_image->src);
  104. }
  105. }
  106. else if (part->cd && part->cd->filename.len > 0) {
  107. elt.begin = (gchar *)part->cd->filename.begin;
  108. elt.len = part->cd->filename.len;
  109. g_array_append_val (ar, elt);
  110. }
  111. }
  112. /* Process mime parts */
  113. for (i = 0; i < task->parts->len; i ++) {
  114. part = g_ptr_array_index (task->parts, i);
  115. if (IS_CT_MULTIPART (part->ct)) {
  116. elt.begin = (gchar *)part->ct->boundary.begin;
  117. elt.len = part->ct->boundary.len;
  118. if (elt.len) {
  119. msg_debug_task ("added stat tokens for mime boundary '%*s'",
  120. (gint)elt.len, elt.begin);
  121. g_array_append_val (ar, elt);
  122. }
  123. if (part->parsed_data.len > 1) {
  124. rspamd_snprintf (tmpbuf, sizeof (tmpbuf), "mime%d:%dlog",
  125. i, (gint)log2 (part->parsed_data.len));
  126. elt.begin = rspamd_mempool_strdup (task->task_pool, tmpbuf);
  127. elt.len = strlen (elt.begin);
  128. g_array_append_val (ar, elt);
  129. }
  130. }
  131. }
  132. /* Process text parts metadata */
  133. for (i = 0; i < task->text_parts->len; i ++) {
  134. tp = g_ptr_array_index (task->text_parts, i);
  135. if (tp->language != NULL && tp->language[0] != '\0') {
  136. elt.begin = (gchar *)tp->language;
  137. elt.len = strlen (elt.begin);
  138. msg_debug_task ("added stat tokens for part language '%s'", elt.begin);
  139. g_array_append_val (ar, elt);
  140. }
  141. if (tp->real_charset != NULL) {
  142. elt.begin = (gchar *)tp->real_charset;
  143. elt.len = strlen (elt.begin);
  144. msg_debug_task ("added stat tokens for part charset '%s'", elt.begin);
  145. g_array_append_val (ar, elt);
  146. }
  147. }
  148. cur = g_list_first (task->cfg->classify_headers);
  149. while (cur) {
  150. rspamd_stat_tokenize_header (task, cur->data, "UA:", ar);
  151. cur = g_list_next (cur);
  152. }
  153. /* Use headers order */
  154. headers_hash = rspamd_mempool_get_variable (task->task_pool,
  155. RSPAMD_MEMPOOL_HEADERS_HASH);
  156. if (headers_hash) {
  157. elt.begin = (gchar *)headers_hash;
  158. elt.len = 16;
  159. g_array_append_val (ar, elt);
  160. }
  161. /* Use more precise headers order */
  162. cur = g_list_first (task->headers_order->head);
  163. while (cur) {
  164. hdr = cur->data;
  165. if (hdr->name && hdr->type != RSPAMD_HEADER_RECEIVED) {
  166. elt.begin = hdr->name;
  167. elt.len = strlen (hdr->name);
  168. g_array_append_val (ar, elt);
  169. }
  170. cur = g_list_next (cur);
  171. }
  172. /* Use metatokens plugin from Lua */
  173. lua_getglobal (L, "rspamd_plugins");
  174. if (lua_type (L, -1) == LUA_TTABLE) {
  175. lua_pushstring (L, "stat_metatokens");
  176. lua_gettable (L, -2);
  177. if (lua_type (L, -1) == LUA_TTABLE) {
  178. gint old_top;
  179. old_top = lua_gettop (L);
  180. lua_pushstring (L, "callback");
  181. lua_gettable (L, -2);
  182. if (lua_type (L, -1) == LUA_TFUNCTION) {
  183. struct rspamd_task **ptask;
  184. ptask = lua_newuserdata (L, sizeof (*ptask));
  185. rspamd_lua_setclass (L, "rspamd{task}", -1);
  186. *ptask = task;
  187. if (lua_pcall (L, 1, LUA_MULTRET, 0) != 0) {
  188. msg_err_task ("stat_metatokens failed: %s",
  189. lua_tostring (L, -1));
  190. lua_pop (L, 1);
  191. } else {
  192. if (lua_gettop (L) > old_top &&
  193. lua_istable (L, old_top + 1)) {
  194. lua_pushvalue (L, old_top + 1);
  195. /* Iterate over table of tables */
  196. for (lua_pushnil (L); lua_next (L, -2);
  197. lua_pop (L, 1)) {
  198. elt.flags = RSPAMD_STAT_TOKEN_FLAG_META|
  199. RSPAMD_STAT_TOKEN_FLAG_LUA_META;
  200. if (lua_isnumber (L, -1)) {
  201. gdouble num = lua_tonumber (L, -1);
  202. guint8 *pnum = rspamd_mempool_alloc (
  203. task->task_pool,
  204. sizeof (num));
  205. msg_debug_task ("got metatoken number: %.2f",
  206. num);
  207. memcpy (pnum, &num, sizeof (num));
  208. elt.begin = (gchar *) pnum;
  209. elt.len = sizeof (num);
  210. g_array_append_val (ar, elt);
  211. } else if (lua_isstring (L, -1)) {
  212. const gchar *str;
  213. gsize tlen;
  214. str = lua_tolstring (L, -1, &tlen);
  215. guint8 *pstr = rspamd_mempool_alloc (
  216. task->task_pool,
  217. tlen);
  218. memcpy (pstr, str, tlen);
  219. msg_debug_task ("got metatoken string: %*s",
  220. (gint) tlen, str);
  221. elt.begin = (gchar *) pstr;
  222. elt.len = tlen;
  223. g_array_append_val (ar, elt);
  224. }
  225. else if (lua_istable (L, -1)) {
  226. /* Treat that as unigramms */
  227. for (lua_pushnil (L); lua_next (L, -2);
  228. lua_pop (L, 1)) {
  229. if (lua_isstring (L, -1)) {
  230. const gchar *str;
  231. gsize tlen;
  232. str = lua_tolstring (L, -1, &tlen);
  233. guint8 *pstr = rspamd_mempool_alloc (
  234. task->task_pool,
  235. tlen);
  236. memcpy (pstr, str, tlen);
  237. msg_debug_task ("got unigramm "
  238. "metatoken string: %*s",
  239. (gint) tlen, str);
  240. elt.begin = (gchar *) pstr;
  241. elt.len = tlen;
  242. elt.flags |= RSPAMD_STAT_TOKEN_FLAG_UNIGRAM;
  243. g_array_append_val (ar, elt);
  244. }
  245. }
  246. }
  247. }
  248. }
  249. }
  250. }
  251. }
  252. }
  253. lua_settop (L, 0);
  254. st_ctx->tokenizer->tokenize_func (st_ctx,
  255. task->task_pool,
  256. ar,
  257. TRUE,
  258. "META:",
  259. task->tokens);
  260. rspamd_mempool_add_destructor (task->task_pool,
  261. rspamd_array_free_hard, ar);
  262. }
  263. /*
  264. * Tokenize task using the tokenizer specified
  265. */
  266. void
  267. rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
  268. struct rspamd_task *task)
  269. {
  270. struct rspamd_mime_text_part *part;
  271. rspamd_cryptobox_hash_state_t hst;
  272. rspamd_stat_token_t *tok;
  273. rspamd_token_t *st_tok;
  274. GArray *words;
  275. gchar *sub = NULL;
  276. guint i, reserved_len = 0;
  277. gdouble *pdiff;
  278. guchar hout[rspamd_cryptobox_HASHBYTES];
  279. gchar *b32_hout;
  280. if (st_ctx == NULL) {
  281. st_ctx = rspamd_stat_get_ctx ();
  282. }
  283. g_assert (st_ctx != NULL);
  284. for (i = 0; i < task->text_parts->len; i++) {
  285. part = g_ptr_array_index (task->text_parts, i);
  286. if (!IS_PART_EMPTY (part) && part->utf_words != NULL) {
  287. reserved_len += part->utf_words->len;
  288. }
  289. /* XXX: normal window size */
  290. reserved_len += 5;
  291. }
  292. task->tokens = g_ptr_array_sized_new (reserved_len);
  293. rspamd_mempool_add_destructor (task->task_pool,
  294. rspamd_ptr_array_free_hard, task->tokens);
  295. pdiff = rspamd_mempool_get_variable (task->task_pool, "parts_distance");
  296. for (i = 0; i < task->text_parts->len; i ++) {
  297. part = g_ptr_array_index (task->text_parts, i);
  298. if (!IS_PART_EMPTY (part) && part->utf_words != NULL) {
  299. st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool,
  300. part->utf_words, IS_PART_UTF (part),
  301. NULL, task->tokens);
  302. }
  303. if (pdiff != NULL && (1.0 - *pdiff) * 100.0 > similarity_treshold) {
  304. msg_debug_task ("message has two common parts (%.2f), so skip the last one",
  305. *pdiff);
  306. break;
  307. }
  308. }
  309. if (task->subject != NULL) {
  310. sub = task->subject;
  311. }
  312. if (sub != NULL) {
  313. UText utxt = UTEXT_INITIALIZER;
  314. UErrorCode uc_err = U_ZERO_ERROR;
  315. gsize slen = strlen (sub);
  316. utext_openUTF8 (&utxt,
  317. sub,
  318. slen,
  319. &uc_err);
  320. words = rspamd_tokenize_text (sub, slen, &utxt, RSPAMD_TOKENIZE_UTF,
  321. NULL, NULL, NULL);
  322. if (words != NULL) {
  323. for (i = 0; i < words->len; i ++) {
  324. tok = &g_array_index (words, rspamd_stat_token_t, i);
  325. tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT;
  326. }
  327. st_ctx->tokenizer->tokenize_func (st_ctx,
  328. task->task_pool,
  329. words,
  330. TRUE,
  331. "SUBJECT",
  332. task->tokens);
  333. rspamd_mempool_add_destructor (task->task_pool,
  334. rspamd_array_free_hard, words);
  335. }
  336. utext_close (&utxt);
  337. }
  338. rspamd_stat_tokenize_parts_metadata (st_ctx, task);
  339. /* Produce signature */
  340. rspamd_cryptobox_hash_init (&hst, NULL, 0);
  341. PTR_ARRAY_FOREACH (task->tokens, i, st_tok) {
  342. rspamd_cryptobox_hash_update (&hst, (guchar *)&st_tok->data,
  343. sizeof (st_tok->data));
  344. }
  345. rspamd_cryptobox_hash_final (&hst, hout);
  346. b32_hout = rspamd_encode_base32 (hout, sizeof (hout));
  347. /*
  348. * We need to strip it to 32 characters providing ~160 bits of
  349. * hash distribution
  350. */
  351. b32_hout[32] = '\0';
  352. rspamd_mempool_set_variable (task->task_pool, RSPAMD_MEMPOOL_STAT_SIGNATURE,
  353. b32_hout, g_free);
  354. }
  355. static void
  356. rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
  357. struct rspamd_task *task, gboolean learn)
  358. {
  359. guint i;
  360. struct rspamd_statfile *st;
  361. gpointer bk_run;
  362. if (task->tokens == NULL) {
  363. rspamd_stat_process_tokenize (st_ctx, task);
  364. }
  365. task->stat_runtimes = g_ptr_array_sized_new (st_ctx->statfiles->len);
  366. g_ptr_array_set_size (task->stat_runtimes, st_ctx->statfiles->len);
  367. rspamd_mempool_add_destructor (task->task_pool,
  368. rspamd_ptr_array_free_hard, task->stat_runtimes);
  369. for (i = 0; i < st_ctx->statfiles->len; i ++) {
  370. st = g_ptr_array_index (st_ctx->statfiles, i);
  371. g_assert (st != NULL);
  372. if (st->classifier->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND) {
  373. g_ptr_array_index (task->stat_runtimes, i) = NULL;
  374. continue;
  375. }
  376. if (!rspamd_symcache_is_symbol_enabled (task, task->cfg->cache,
  377. st->stcf->symbol)) {
  378. g_ptr_array_index (task->stat_runtimes, i) = NULL;
  379. msg_debug_task ("symbol %s is disabled, skip classification",
  380. st->stcf->symbol);
  381. continue;
  382. }
  383. bk_run = st->backend->runtime (task, st->stcf, learn, st->bkcf);
  384. if (bk_run == NULL) {
  385. msg_err_task ("cannot init backend %s for statfile %s",
  386. st->backend->name, st->stcf->symbol);
  387. }
  388. g_ptr_array_index (task->stat_runtimes, i) = bk_run;
  389. }
  390. }
  391. static void
  392. rspamd_stat_backends_process (struct rspamd_stat_ctx *st_ctx,
  393. struct rspamd_task *task)
  394. {
  395. guint i;
  396. struct rspamd_statfile *st;
  397. struct rspamd_classifier *cl;
  398. gpointer bk_run;
  399. g_assert (task->stat_runtimes != NULL);
  400. for (i = 0; i < st_ctx->statfiles->len; i++) {
  401. st = g_ptr_array_index (st_ctx->statfiles, i);
  402. cl = st->classifier;
  403. if (cl->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND) {
  404. continue;
  405. }
  406. bk_run = g_ptr_array_index (task->stat_runtimes, i);
  407. if (bk_run != NULL) {
  408. st->backend->process_tokens (task, task->tokens, i, bk_run);
  409. }
  410. }
  411. }
  412. static gboolean
  413. rspamd_stat_backends_post_process (struct rspamd_stat_ctx *st_ctx,
  414. struct rspamd_task *task)
  415. {
  416. guint i;
  417. struct rspamd_statfile *st;
  418. struct rspamd_classifier *cl;
  419. gpointer bk_run;
  420. g_assert (task->stat_runtimes != NULL);
  421. for (i = 0; i < st_ctx->statfiles->len; i++) {
  422. st = g_ptr_array_index (st_ctx->statfiles, i);
  423. cl = st->classifier;
  424. if (cl->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND) {
  425. continue;
  426. }
  427. bk_run = g_ptr_array_index (task->stat_runtimes, i);
  428. if (bk_run != NULL) {
  429. if (!st->backend->finalize_process (task, bk_run, st_ctx)) {
  430. return FALSE;
  431. }
  432. }
  433. }
  434. return TRUE;
  435. }
  436. static void
  437. rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx,
  438. struct rspamd_task *task)
  439. {
  440. guint i, j, id;
  441. struct rspamd_classifier *cl;
  442. struct rspamd_statfile *st;
  443. gpointer bk_run;
  444. gboolean skip;
  445. if (st_ctx->classifiers->len == 0) {
  446. return;
  447. }
  448. /*
  449. * Do not classify a message if some class is missing
  450. */
  451. if (!(task->flags & RSPAMD_TASK_FLAG_HAS_SPAM_TOKENS)) {
  452. msg_info_task ("skip statistics as SPAM class is missing");
  453. return;
  454. }
  455. if (!(task->flags & RSPAMD_TASK_FLAG_HAS_HAM_TOKENS)) {
  456. msg_info_task ("skip statistics as HAM class is missing");
  457. return;
  458. }
  459. for (i = 0; i < st_ctx->statfiles->len; i++) {
  460. st = g_ptr_array_index (st_ctx->statfiles, i);
  461. cl = st->classifier;
  462. if (cl->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND) {
  463. continue;
  464. }
  465. bk_run = g_ptr_array_index (task->stat_runtimes, i);
  466. g_assert (st != NULL);
  467. if (bk_run != NULL) {
  468. if (st->stcf->is_spam) {
  469. cl->spam_learns += st->backend->total_learns (task,
  470. bk_run,
  471. st_ctx);
  472. }
  473. else {
  474. cl->ham_learns += st->backend->total_learns (task,
  475. bk_run,
  476. st_ctx);
  477. }
  478. }
  479. }
  480. for (i = 0; i < st_ctx->classifiers->len; i++) {
  481. cl = g_ptr_array_index (st_ctx->classifiers, i);
  482. g_assert (cl != NULL);
  483. /* Ensure that all symbols enabled */
  484. skip = FALSE;
  485. if (!(cl->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND)) {
  486. for (j = 0; j < cl->statfiles_ids->len; j++) {
  487. id = g_array_index (cl->statfiles_ids, gint, j);
  488. bk_run = g_ptr_array_index (task->stat_runtimes, id);
  489. st = g_ptr_array_index (st_ctx->statfiles, id);
  490. if (bk_run == NULL) {
  491. skip = TRUE;
  492. msg_debug_task ("disable classifier %s as statfile symbol %s is disabled",
  493. cl->cfg->name, st->stcf->symbol);
  494. break;
  495. }
  496. }
  497. }
  498. if (!skip) {
  499. if (cl->cfg->min_tokens > 0 && task->tokens->len < cl->cfg->min_tokens) {
  500. msg_debug_task (
  501. "<%s> contains less tokens than required for %s classifier: "
  502. "%ud < %ud",
  503. task->message_id,
  504. cl->cfg->name,
  505. task->tokens->len,
  506. cl->cfg->min_tokens);
  507. continue;
  508. }
  509. else if (cl->cfg->max_tokens > 0 && task->tokens->len > cl->cfg->max_tokens) {
  510. msg_debug_task (
  511. "<%s> contains more tokens than allowed for %s classifier: "
  512. "%ud > %ud",
  513. task->message_id,
  514. cl->cfg->name,
  515. task->tokens->len,
  516. cl->cfg->max_tokens);
  517. continue;
  518. }
  519. cl->subrs->classify_func (cl, task->tokens, task);
  520. }
  521. }
  522. }
  523. rspamd_stat_result_t
  524. rspamd_stat_classify (struct rspamd_task *task, lua_State *L, guint stage,
  525. GError **err)
  526. {
  527. struct rspamd_stat_ctx *st_ctx;
  528. rspamd_stat_result_t ret = RSPAMD_STAT_PROCESS_OK;
  529. st_ctx = rspamd_stat_get_ctx ();
  530. g_assert (st_ctx != NULL);
  531. if (st_ctx->classifiers->len == 0) {
  532. task->processed_stages |= stage;
  533. return ret;
  534. }
  535. if (stage == RSPAMD_TASK_STAGE_CLASSIFIERS_PRE) {
  536. /* Preprocess tokens */
  537. rspamd_stat_preprocess (st_ctx, task, FALSE);
  538. }
  539. else if (stage == RSPAMD_TASK_STAGE_CLASSIFIERS) {
  540. /* Process backends */
  541. rspamd_stat_backends_process (st_ctx, task);
  542. }
  543. else if (stage == RSPAMD_TASK_STAGE_CLASSIFIERS_POST) {
  544. /* Process classifiers */
  545. if (rspamd_stat_backends_post_process (st_ctx, task)) {
  546. rspamd_stat_classifiers_process (st_ctx, task);
  547. }
  548. /* Do not process classifiers on backend failures */
  549. }
  550. task->processed_stages |= stage;
  551. return ret;
  552. }
  553. static gboolean
  554. rspamd_stat_cache_check (struct rspamd_stat_ctx *st_ctx,
  555. struct rspamd_task *task,
  556. const gchar *classifier,
  557. gboolean spam,
  558. GError **err)
  559. {
  560. rspamd_learn_t learn_res = RSPAMD_LEARN_OK;
  561. struct rspamd_classifier *cl, *sel = NULL;
  562. gpointer rt;
  563. guint i;
  564. /* Check whether we have learned that file */
  565. for (i = 0; i < st_ctx->classifiers->len; i ++) {
  566. cl = g_ptr_array_index (st_ctx->classifiers, i);
  567. /* Skip other classifiers if they are not needed */
  568. if (classifier != NULL && (cl->cfg->name == NULL ||
  569. g_ascii_strcasecmp (classifier, cl->cfg->name) != 0)) {
  570. continue;
  571. }
  572. sel = cl;
  573. if (sel->cache && sel->cachecf) {
  574. rt = cl->cache->runtime (task, sel->cachecf, FALSE);
  575. learn_res = cl->cache->check (task, spam, rt);
  576. }
  577. if (learn_res == RSPAMD_LEARN_INGORE) {
  578. /* Do not learn twice */
  579. g_set_error (err, rspamd_stat_quark (), 404, "<%s> has been already "
  580. "learned as %s, ignore it", task->message_id,
  581. spam ? "spam" : "ham");
  582. task->flags |= RSPAMD_TASK_FLAG_ALREADY_LEARNED;
  583. return FALSE;
  584. }
  585. else if (learn_res == RSPAMD_LEARN_UNLEARN) {
  586. task->flags |= RSPAMD_TASK_FLAG_UNLEARN;
  587. break;
  588. }
  589. }
  590. if (sel == NULL) {
  591. if (classifier) {
  592. g_set_error (err, rspamd_stat_quark (), 404, "cannot find classifier "
  593. "with name %s", classifier);
  594. }
  595. else {
  596. g_set_error (err, rspamd_stat_quark (), 404, "no classifiers defined");
  597. }
  598. return FALSE;
  599. }
  600. return TRUE;
  601. }
  602. static gboolean
  603. rspamd_stat_classifiers_learn (struct rspamd_stat_ctx *st_ctx,
  604. struct rspamd_task *task,
  605. const gchar *classifier,
  606. gboolean spam,
  607. GError **err)
  608. {
  609. struct rspamd_classifier *cl, *sel = NULL;
  610. guint i;
  611. gboolean learned = FALSE, too_small = FALSE, too_large = FALSE,
  612. conditionally_skipped = FALSE;
  613. lua_State *L;
  614. struct rspamd_task **ptask;
  615. GList *cur;
  616. gint cb_ref;
  617. gchar *cond_str = NULL;
  618. if ((task->flags & RSPAMD_TASK_FLAG_ALREADY_LEARNED) && err != NULL &&
  619. *err == NULL) {
  620. /* Do not learn twice */
  621. g_set_error (err, rspamd_stat_quark (), 404, "<%s> has been already "
  622. "learned as %s, ignore it", task->message_id,
  623. spam ? "spam" : "ham");
  624. return FALSE;
  625. }
  626. /* Check whether we have learned that file */
  627. for (i = 0; i < st_ctx->classifiers->len; i ++) {
  628. cl = g_ptr_array_index (st_ctx->classifiers, i);
  629. /* Skip other classifiers if they are not needed */
  630. if (classifier != NULL && (cl->cfg->name == NULL ||
  631. g_ascii_strcasecmp (classifier, cl->cfg->name) != 0)) {
  632. continue;
  633. }
  634. sel = cl;
  635. /* Now check max and min tokens */
  636. if (cl->cfg->min_tokens > 0 && task->tokens->len < cl->cfg->min_tokens) {
  637. msg_info_task (
  638. "<%s> contains less tokens than required for %s classifier: "
  639. "%ud < %ud",
  640. task->message_id,
  641. cl->cfg->name,
  642. task->tokens->len,
  643. cl->cfg->min_tokens);
  644. too_small = TRUE;
  645. continue;
  646. }
  647. else if (cl->cfg->max_tokens > 0 && task->tokens->len > cl->cfg->max_tokens) {
  648. msg_info_task (
  649. "<%s> contains more tokens than allowed for %s classifier: "
  650. "%ud > %ud",
  651. task->message_id,
  652. cl->cfg->name,
  653. task->tokens->len,
  654. cl->cfg->max_tokens);
  655. too_large = TRUE;
  656. continue;
  657. }
  658. /* Check all conditions for this classifier */
  659. cur = cl->cfg->learn_conditions;
  660. L = task->cfg->lua_state;
  661. while (cur) {
  662. cb_ref = GPOINTER_TO_INT (cur->data);
  663. lua_settop (L, 0);
  664. lua_rawgeti (L, LUA_REGISTRYINDEX, cb_ref);
  665. /* Push task and two booleans: is_spam and is_unlearn */
  666. ptask = lua_newuserdata (L, sizeof (*ptask));
  667. *ptask = task;
  668. rspamd_lua_setclass (L, "rspamd{task}", -1);
  669. lua_pushboolean (L, spam);
  670. lua_pushboolean (L,
  671. task->flags & RSPAMD_TASK_FLAG_UNLEARN ? true : false);
  672. if (lua_pcall (L, 3, LUA_MULTRET, 0) != 0) {
  673. msg_err_task ("call to %s failed: %s",
  674. "condition callback",
  675. lua_tostring (L, -1));
  676. }
  677. else {
  678. if (lua_isboolean (L, 1)) {
  679. if (!lua_toboolean (L, 1)) {
  680. conditionally_skipped = TRUE;
  681. /* Also check for error string if needed */
  682. if (lua_isstring (L, 2)) {
  683. cond_str = rspamd_mempool_strdup (task->task_pool,
  684. lua_tostring (L, 2));
  685. }
  686. lua_settop (L, 0);
  687. break;
  688. }
  689. }
  690. }
  691. lua_settop (L, 0);
  692. cur = g_list_next (cur);
  693. }
  694. if (conditionally_skipped) {
  695. break;
  696. }
  697. if (cl->subrs->learn_spam_func (cl, task->tokens, task, spam,
  698. task->flags & RSPAMD_TASK_FLAG_UNLEARN, err)) {
  699. learned = TRUE;
  700. }
  701. }
  702. if (sel == NULL) {
  703. if (classifier) {
  704. g_set_error (err, rspamd_stat_quark (), 404, "cannot find classifier "
  705. "with name %s", classifier);
  706. }
  707. else {
  708. g_set_error (err, rspamd_stat_quark (), 404, "no classifiers defined");
  709. }
  710. return FALSE;
  711. }
  712. if (!learned && err && *err == NULL) {
  713. if (too_large) {
  714. g_set_error (err, rspamd_stat_quark (), 400,
  715. "<%s> contains more tokens than allowed for %s classifier: "
  716. "%d > %d",
  717. task->message_id,
  718. cl->cfg->name,
  719. task->tokens->len,
  720. cl->cfg->max_tokens);
  721. }
  722. else if (too_small) {
  723. g_set_error (err, rspamd_stat_quark (), 400,
  724. "<%s> contains less tokens than required for %s classifier: "
  725. "%d < %d",
  726. task->message_id,
  727. cl->cfg->name,
  728. task->tokens->len,
  729. cl->cfg->min_tokens);
  730. }
  731. else if (conditionally_skipped) {
  732. g_set_error (err, rspamd_stat_quark (), 410,
  733. "<%s> is skipped for %s classifier: "
  734. "%s",
  735. task->message_id,
  736. cl->cfg->name,
  737. cond_str ? cond_str : "unknown reason");
  738. }
  739. }
  740. return learned;
  741. }
  742. static gboolean
  743. rspamd_stat_backends_learn (struct rspamd_stat_ctx *st_ctx,
  744. struct rspamd_task *task,
  745. const gchar *classifier,
  746. gboolean spam,
  747. GError **err)
  748. {
  749. struct rspamd_classifier *cl, *sel = NULL;
  750. struct rspamd_statfile *st;
  751. gpointer bk_run;
  752. guint i, j;
  753. gint id;
  754. gboolean res = FALSE;
  755. for (i = 0; i < st_ctx->classifiers->len; i ++) {
  756. cl = g_ptr_array_index (st_ctx->classifiers, i);
  757. /* Skip other classifiers if they are not needed */
  758. if (classifier != NULL && (cl->cfg->name == NULL ||
  759. g_ascii_strcasecmp (classifier, cl->cfg->name) != 0)) {
  760. continue;
  761. }
  762. if (cl->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND) {
  763. res = TRUE;
  764. continue;
  765. }
  766. sel = cl;
  767. for (j = 0; j < cl->statfiles_ids->len; j ++) {
  768. id = g_array_index (cl->statfiles_ids, gint, j);
  769. st = g_ptr_array_index (st_ctx->statfiles, id);
  770. bk_run = g_ptr_array_index (task->stat_runtimes, id);
  771. g_assert (st != NULL);
  772. if (bk_run == NULL) {
  773. /* XXX: must be error */
  774. continue;
  775. }
  776. if (!(task->flags & RSPAMD_TASK_FLAG_UNLEARN)) {
  777. if (!!spam != !!st->stcf->is_spam) {
  778. /* If we are not unlearning, then do not touch another class */
  779. continue;
  780. }
  781. }
  782. if (!st->backend->learn_tokens (task, task->tokens, id, bk_run)) {
  783. if (err && *err == NULL) {
  784. g_set_error (err, rspamd_stat_quark (), 500, "Cannot push "
  785. "learned results to the backend");
  786. }
  787. res = FALSE;
  788. goto end;
  789. }
  790. else {
  791. if (!!spam == !!st->stcf->is_spam) {
  792. st->backend->inc_learns (task, bk_run, st_ctx);
  793. }
  794. else if (task->flags & RSPAMD_TASK_FLAG_UNLEARN) {
  795. st->backend->dec_learns (task, bk_run, st_ctx);
  796. }
  797. res = TRUE;
  798. }
  799. }
  800. }
  801. end:
  802. if (!res && err) {
  803. return res;
  804. }
  805. if (!res && sel == NULL) {
  806. if (classifier) {
  807. g_set_error (err, rspamd_stat_quark (), 404, "cannot find classifier "
  808. "with name %s", classifier);
  809. }
  810. else {
  811. g_set_error (err, rspamd_stat_quark (), 404, "no classifiers defined");
  812. }
  813. return FALSE;
  814. }
  815. if (!res) {
  816. g_set_error (err, rspamd_stat_quark (), 404, "cannot find statfile "
  817. "backend to learn %s in %s", spam ? "spam" : "ham",
  818. classifier ? classifier : "default classifier");
  819. }
  820. return res;
  821. }
  822. static gboolean
  823. rspamd_stat_backends_post_learn (struct rspamd_stat_ctx *st_ctx,
  824. struct rspamd_task *task,
  825. const gchar *classifier,
  826. gboolean spam,
  827. GError **err)
  828. {
  829. struct rspamd_classifier *cl;
  830. struct rspamd_statfile *st;
  831. gpointer bk_run, cache_run;
  832. guint i, j;
  833. gint id;
  834. gboolean res = TRUE;
  835. for (i = 0; i < st_ctx->classifiers->len; i ++) {
  836. cl = g_ptr_array_index (st_ctx->classifiers, i);
  837. /* Skip other classifiers if they are not needed */
  838. if (classifier != NULL && (cl->cfg->name == NULL ||
  839. g_ascii_strcasecmp (classifier, cl->cfg->name) != 0)) {
  840. continue;
  841. }
  842. if (cl->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND) {
  843. res = TRUE;
  844. continue;
  845. }
  846. for (j = 0; j < cl->statfiles_ids->len; j ++) {
  847. id = g_array_index (cl->statfiles_ids, gint, j);
  848. st = g_ptr_array_index (st_ctx->statfiles, id);
  849. bk_run = g_ptr_array_index (task->stat_runtimes, id);
  850. g_assert (st != NULL);
  851. if (bk_run == NULL) {
  852. /* XXX: must be error */
  853. continue;
  854. }
  855. if (!st->backend->finalize_learn (task, bk_run, st_ctx, err)) {
  856. return RSPAMD_STAT_PROCESS_ERROR;
  857. }
  858. }
  859. if (cl->cache) {
  860. cache_run = cl->cache->runtime (task, cl->cachecf, TRUE);
  861. cl->cache->learn (task, spam, cache_run);
  862. }
  863. }
  864. g_atomic_int_add (&task->worker->srv->stat->messages_learned, 1);
  865. return res;
  866. }
  867. rspamd_stat_result_t
  868. rspamd_stat_learn (struct rspamd_task *task,
  869. gboolean spam, lua_State *L, const gchar *classifier, guint stage,
  870. GError **err)
  871. {
  872. struct rspamd_stat_ctx *st_ctx;
  873. rspamd_stat_result_t ret = RSPAMD_STAT_PROCESS_OK;
  874. /*
  875. * We assume now that a task has been already classified before
  876. * coming to learn
  877. */
  878. g_assert (RSPAMD_TASK_IS_CLASSIFIED (task));
  879. st_ctx = rspamd_stat_get_ctx ();
  880. g_assert (st_ctx != NULL);
  881. if (st_ctx->classifiers->len == 0) {
  882. task->processed_stages |= stage;
  883. return ret;
  884. }
  885. if (stage == RSPAMD_TASK_STAGE_LEARN_PRE) {
  886. /* Process classifiers */
  887. if (!rspamd_stat_cache_check (st_ctx, task, classifier, spam, err)) {
  888. return RSPAMD_STAT_PROCESS_ERROR;
  889. }
  890. }
  891. else if (stage == RSPAMD_TASK_STAGE_LEARN) {
  892. /* Process classifiers */
  893. if (!rspamd_stat_classifiers_learn (st_ctx, task, classifier,
  894. spam, err)) {
  895. return RSPAMD_STAT_PROCESS_ERROR;
  896. }
  897. /* Process backends */
  898. if (!rspamd_stat_backends_learn (st_ctx, task, classifier, spam, err)) {
  899. return RSPAMD_STAT_PROCESS_ERROR;
  900. }
  901. }
  902. else if (stage == RSPAMD_TASK_STAGE_LEARN_POST) {
  903. if (!rspamd_stat_backends_post_learn (st_ctx, task, classifier, spam, err)) {
  904. return RSPAMD_STAT_PROCESS_ERROR;
  905. }
  906. }
  907. task->processed_stages |= stage;
  908. return ret;
  909. }
  910. static gboolean
  911. rspamd_stat_has_classifier_symbols (struct rspamd_task *task,
  912. struct rspamd_metric_result *mres,
  913. struct rspamd_classifier *cl)
  914. {
  915. guint i;
  916. gint id;
  917. struct rspamd_statfile *st;
  918. struct rspamd_stat_ctx *st_ctx;
  919. gboolean is_spam;
  920. if (mres == NULL) {
  921. return FALSE;
  922. }
  923. st_ctx = rspamd_stat_get_ctx ();
  924. is_spam = !!(task->flags & RSPAMD_TASK_FLAG_LEARN_SPAM);
  925. for (i = 0; i < cl->statfiles_ids->len; i ++) {
  926. id = g_array_index (cl->statfiles_ids, gint, i);
  927. st = g_ptr_array_index (st_ctx->statfiles, id);
  928. if (rspamd_task_find_symbol_result (task, st->stcf->symbol)) {
  929. if (is_spam == !!st->stcf->is_spam) {
  930. msg_debug_task ("do not autolearn %s as symbol %s is already "
  931. "added", is_spam ? "spam" : "ham", st->stcf->symbol);
  932. return TRUE;
  933. }
  934. }
  935. }
  936. return FALSE;
  937. }
  938. gboolean
  939. rspamd_stat_check_autolearn (struct rspamd_task *task)
  940. {
  941. struct rspamd_stat_ctx *st_ctx;
  942. struct rspamd_classifier *cl;
  943. const ucl_object_t *obj, *elt1, *elt2;
  944. struct rspamd_metric_result *mres = NULL;
  945. struct rspamd_task **ptask;
  946. lua_State *L;
  947. GString *tb;
  948. guint i;
  949. gint err_idx;
  950. gboolean ret = FALSE;
  951. gdouble ham_score, spam_score;
  952. const gchar *lua_script, *lua_ret;
  953. g_assert (RSPAMD_TASK_IS_CLASSIFIED (task));
  954. st_ctx = rspamd_stat_get_ctx ();
  955. g_assert (st_ctx != NULL);
  956. for (i = 0; i < st_ctx->classifiers->len; i ++) {
  957. cl = g_ptr_array_index (st_ctx->classifiers, i);
  958. ret = FALSE;
  959. if (cl->cfg->opts) {
  960. obj = ucl_object_lookup (cl->cfg->opts, "autolearn");
  961. if (ucl_object_type (obj) == UCL_BOOLEAN) {
  962. if (ucl_object_toboolean (obj)) {
  963. /*
  964. * Default learning algorithm:
  965. *
  966. * - We learn spam if action is ACTION_REJECT
  967. * - We learn ham if score is less than zero
  968. */
  969. mres = task->result;
  970. if (mres) {
  971. if (mres->score > rspamd_task_get_required_score (task, mres)) {
  972. task->flags |= RSPAMD_TASK_FLAG_LEARN_SPAM;
  973. ret = TRUE;
  974. }
  975. else if (mres->score < 0) {
  976. task->flags |= RSPAMD_TASK_FLAG_LEARN_HAM;
  977. ret = TRUE;
  978. }
  979. }
  980. }
  981. }
  982. else if (ucl_object_type (obj) == UCL_ARRAY && obj->len == 2) {
  983. /*
  984. * We have an array of 2 elements, treat it as a
  985. * ham_score, spam_score
  986. */
  987. elt1 = ucl_array_find_index (obj, 0);
  988. elt2 = ucl_array_find_index (obj, 1);
  989. if ((ucl_object_type (elt1) == UCL_FLOAT ||
  990. ucl_object_type (elt1) == UCL_INT) &&
  991. (ucl_object_type (elt2) == UCL_FLOAT ||
  992. ucl_object_type (elt2) == UCL_INT)) {
  993. ham_score = ucl_object_todouble (elt1);
  994. spam_score = ucl_object_todouble (elt2);
  995. if (ham_score > spam_score) {
  996. gdouble t;
  997. t = ham_score;
  998. ham_score = spam_score;
  999. spam_score = t;
  1000. }
  1001. mres = task->result;
  1002. if (mres) {
  1003. if (mres->score >= spam_score) {
  1004. task->flags |= RSPAMD_TASK_FLAG_LEARN_SPAM;
  1005. ret = TRUE;
  1006. }
  1007. else if (mres->score <= ham_score) {
  1008. task->flags |= RSPAMD_TASK_FLAG_LEARN_HAM;
  1009. ret = TRUE;
  1010. }
  1011. }
  1012. }
  1013. }
  1014. else if (ucl_object_type (obj) == UCL_STRING) {
  1015. lua_script = ucl_object_tostring (obj);
  1016. L = task->cfg->lua_state;
  1017. if (luaL_dostring (L, lua_script) != 0) {
  1018. msg_err_task ("cannot execute lua script for autolearn "
  1019. "extraction: %s", lua_tostring (L, -1));
  1020. }
  1021. else {
  1022. if (lua_type (L, -1) == LUA_TFUNCTION) {
  1023. lua_pushcfunction (L, &rspamd_lua_traceback);
  1024. err_idx = lua_gettop (L);
  1025. lua_pushvalue (L, -2); /* Function itself */
  1026. ptask = lua_newuserdata (L, sizeof (struct rspamd_task *));
  1027. *ptask = task;
  1028. rspamd_lua_setclass (L, "rspamd{task}", -1);
  1029. if (lua_pcall (L, 1, 1, err_idx) != 0) {
  1030. tb = lua_touserdata (L, -1);
  1031. msg_err_task ("call to autolearn script failed: "
  1032. "%v", tb);
  1033. g_string_free (tb, TRUE);
  1034. }
  1035. else {
  1036. lua_ret = lua_tostring (L, -1);
  1037. if (lua_ret) {
  1038. if (strcmp (lua_ret, "ham") == 0) {
  1039. task->flags |= RSPAMD_TASK_FLAG_LEARN_HAM;
  1040. ret = TRUE;
  1041. }
  1042. else if (strcmp (lua_ret, "spam") == 0) {
  1043. task->flags |= RSPAMD_TASK_FLAG_LEARN_SPAM;
  1044. ret = TRUE;
  1045. }
  1046. }
  1047. }
  1048. /* Result + error function + original function */
  1049. lua_pop (L, 3);
  1050. }
  1051. else {
  1052. msg_err_task ("lua script must return "
  1053. "function(task) and not %s",
  1054. lua_typename (L, lua_type (
  1055. L, -1)));
  1056. }
  1057. }
  1058. }
  1059. if (ret) {
  1060. /* Do not autolearn if we have this symbol already */
  1061. if (rspamd_stat_has_classifier_symbols (task, mres, cl)) {
  1062. ret = FALSE;
  1063. task->flags &= ~(RSPAMD_TASK_FLAG_LEARN_HAM |
  1064. RSPAMD_TASK_FLAG_LEARN_SPAM);
  1065. }
  1066. else if (mres != NULL) {
  1067. if (task->flags & RSPAMD_TASK_FLAG_LEARN_HAM) {
  1068. msg_info_task ("<%s>: autolearn ham for classifier "
  1069. "'%s' as message's "
  1070. "score is negative: %.2f",
  1071. task->message_id, cl->cfg->name,
  1072. mres->score);
  1073. }
  1074. else {
  1075. msg_info_task ("<%s>: autolearn spam for classifier "
  1076. "'%s' as message's "
  1077. "action is reject, score: %.2f",
  1078. task->message_id, cl->cfg->name,
  1079. mres->score);
  1080. }
  1081. task->classifier = cl->cfg->name;
  1082. break;
  1083. }
  1084. }
  1085. }
  1086. }
  1087. return ret;
  1088. }
  1089. /**
  1090. * Get the overall statistics for all statfile backends
  1091. * @param cfg configuration
  1092. * @param total_learns the total number of learns is stored here
  1093. * @return array of statistical information
  1094. */
  1095. rspamd_stat_result_t
  1096. rspamd_stat_statistics (struct rspamd_task *task,
  1097. struct rspamd_config *cfg,
  1098. guint64 *total_learns,
  1099. ucl_object_t **target)
  1100. {
  1101. struct rspamd_stat_ctx *st_ctx;
  1102. struct rspamd_classifier *cl;
  1103. struct rspamd_statfile *st;
  1104. gpointer backend_runtime;
  1105. ucl_object_t *res = NULL, *elt;
  1106. guint64 learns = 0;
  1107. guint i, j;
  1108. gint id;
  1109. st_ctx = rspamd_stat_get_ctx ();
  1110. g_assert (st_ctx != NULL);
  1111. res = ucl_object_typed_new (UCL_ARRAY);
  1112. for (i = 0; i < st_ctx->classifiers->len; i ++) {
  1113. cl = g_ptr_array_index (st_ctx->classifiers, i);
  1114. if (cl->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND) {
  1115. continue;
  1116. }
  1117. for (j = 0; j < cl->statfiles_ids->len; j ++) {
  1118. id = g_array_index (cl->statfiles_ids, gint, j);
  1119. st = g_ptr_array_index (st_ctx->statfiles, id);
  1120. backend_runtime = st->backend->runtime (task, st->stcf, FALSE,
  1121. st->bkcf);
  1122. elt = st->backend->get_stat (backend_runtime, st->bkcf);
  1123. if (elt && ucl_object_type (elt) == UCL_OBJECT) {
  1124. const ucl_object_t *rev = ucl_object_lookup (elt, "revision");
  1125. learns += ucl_object_toint (rev);
  1126. }
  1127. else {
  1128. learns += st->backend->total_learns (task, backend_runtime,
  1129. st->bkcf);
  1130. }
  1131. if (elt != NULL) {
  1132. ucl_array_append (res, elt);
  1133. }
  1134. }
  1135. }
  1136. if (total_learns != NULL) {
  1137. *total_learns = learns;
  1138. }
  1139. if (target) {
  1140. *target = res;
  1141. }
  1142. return RSPAMD_STAT_PROCESS_OK;
  1143. }