You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

message.c 38KB


  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "util.h"
  18. #include "rspamd.h"
  19. #include "message.h"
  20. #include "libserver/html/html.h"
  21. #include "images.h"
  22. #include "archives.h"
  23. #include "tokenizers/tokenizers.h"
  24. #include "smtp_parsers.h"
  25. #include "mime_parser.h"
  26. #include "mime_encoding.h"
  27. #include "lang_detection.h"
  28. #include "libutil/multipattern.h"
  29. #include "libserver/mempool_vars_internal.h"
  30. #ifdef WITH_SNOWBALL
  31. #include "libstemmer.h"
  32. #endif
  33. #include <math.h>
  34. #include <unicode/uchar.h>
  35. #include "sodium.h"
  36. #include "libserver/cfg_file_private.h"
  37. #include "lua/lua_common.h"
  38. #include "contrib/uthash/utlist.h"
  39. #include "contrib/t1ha/t1ha.h"
  40. #include "received.h"
  /* Symbol name inserted into the task result when a GTUBE pattern is matched */
  41. #define GTUBE_SYMBOL "GTUBE"
  /* Helpers that clear or set the UTF flag of a text part */
  42. #define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF)
  43. #define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF)
  /*
   * GTUBE test patterns. They differ only in the first character and are
   * registered in the matcher in this order by rspamd_check_gtube, which maps
   * them to the reject, add header, rewrite subject and no action actions.
   */
  44. static const gchar gtube_pattern_reject[] = "XJS*C4JDBQADN1.NSBN3*2IDNEN*"
  45. "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X";
  46. static const gchar gtube_pattern_add_header[] = "YJS*C4JDBQADN1.NSBN3*2IDNEN*"
  47. "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X";
  48. static const gchar gtube_pattern_rewrite_subject[] = "ZJS*C4JDBQADN1.NSBN3*2IDNEN*"
  49. "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X";
  50. static const gchar gtube_pattern_no_action[] = "AJS*C4JDBQADN1.NSBN3*2IDNEN*"
  51. "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X";
  /* Multipattern matcher for the patterns above; created lazily on first use */
  52. struct rspamd_multipattern *gtube_matcher = NULL;
  /* Fixed seed used when hashing stemmed words */
  53. static const guint64 words_hash_seed = 0xdeadbabe;
  54. static void
  55. free_byte_array_callback (void *pointer)
  56. {
  57. GByteArray *arr = (GByteArray *) pointer;
  58. g_byte_array_free (arr, TRUE);
  59. }
  60. static void
  61. rspamd_mime_part_extract_words (struct rspamd_task *task,
  62. struct rspamd_mime_text_part *part)
  63. {
  64. rspamd_stat_token_t *w;
  65. guint i, total_len = 0, short_len = 0;
  66. if (part->utf_words) {
  67. rspamd_stem_words (part->utf_words, task->task_pool, part->language,
  68. task->lang_det);
  69. for (i = 0; i < part->utf_words->len; i++) {
  70. guint64 h;
  71. w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
  72. if (w->stemmed.len > 0) {
  73. /*
  74. * We use static hash seed if we would want to use that in shingles
  75. * computation in future
  76. */
  77. h = rspamd_cryptobox_fast_hash_specific (
  78. RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
  79. w->stemmed.begin, w->stemmed.len, words_hash_seed);
  80. g_array_append_val (part->normalized_hashes, h);
  81. total_len += w->stemmed.len;
  82. if (w->stemmed.len <= 3) {
  83. short_len++;
  84. }
  85. if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT &&
  86. !(w->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED)) {
  87. part->nwords ++;
  88. }
  89. }
  90. if (w->flags & (RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE|
  91. RSPAMD_STAT_TOKEN_FLAG_NORMALISED|
  92. RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES)) {
  93. task->flags |= RSPAMD_TASK_FLAG_BAD_UNICODE;
  94. }
  95. }
  96. if (part->utf_words->len) {
  97. gdouble *avg_len_p, *short_len_p;
  98. avg_len_p = rspamd_mempool_get_variable (task->task_pool,
  99. RSPAMD_MEMPOOL_AVG_WORDS_LEN);
  100. if (avg_len_p == NULL) {
  101. avg_len_p = rspamd_mempool_alloc (task->task_pool,
  102. sizeof (double));
  103. *avg_len_p = total_len;
  104. rspamd_mempool_set_variable (task->task_pool,
  105. RSPAMD_MEMPOOL_AVG_WORDS_LEN, avg_len_p, NULL);
  106. }
  107. else {
  108. *avg_len_p += total_len;
  109. }
  110. short_len_p = rspamd_mempool_get_variable (task->task_pool,
  111. RSPAMD_MEMPOOL_SHORT_WORDS_CNT);
  112. if (short_len_p == NULL) {
  113. short_len_p = rspamd_mempool_alloc (task->task_pool,
  114. sizeof (double));
  115. *short_len_p = short_len;
  116. rspamd_mempool_set_variable (task->task_pool,
  117. RSPAMD_MEMPOOL_SHORT_WORDS_CNT, avg_len_p, NULL);
  118. }
  119. else {
  120. *short_len_p += short_len;
  121. }
  122. }
  123. }
  124. }
  125. static void
  126. rspamd_mime_part_create_words (struct rspamd_task *task,
  127. struct rspamd_mime_text_part *part)
  128. {
  129. enum rspamd_tokenize_type tok_type;
  130. if (IS_TEXT_PART_UTF (part)) {
  131. #if U_ICU_VERSION_MAJOR_NUM < 50
  132. /* Hack to prevent hang with Thai in old libicu */
  133. const gchar *p = part->utf_stripped_content->data, *end;
  134. guint i = 0;
  135. end = p + part->utf_stripped_content->len;
  136. gint32 uc, sc;
  137. tok_type = RSPAMD_TOKENIZE_UTF;
  138. while (p + i < end) {
  139. U8_NEXT (p, i, part->utf_stripped_content->len, uc);
  140. if (((gint32) uc) < 0) {
  141. tok_type = RSPAMD_TOKENIZE_RAW;
  142. break;
  143. }
  144. if (u_isalpha (uc)) {
  145. sc = ublock_getCode (uc);
  146. if (sc == UBLOCK_THAI) {
  147. msg_info_task ("enable workaround for Thai characters for old libicu");
  148. tok_type = RSPAMD_TOKENIZE_RAW;
  149. break;
  150. }
  151. }
  152. }
  153. #else
  154. tok_type = RSPAMD_TOKENIZE_UTF;
  155. #endif
  156. }
  157. else {
  158. tok_type = RSPAMD_TOKENIZE_RAW;
  159. }
  160. part->utf_words = rspamd_tokenize_text (
  161. part->utf_stripped_content->data,
  162. part->utf_stripped_content->len,
  163. &part->utf_stripped_text,
  164. tok_type, task->cfg,
  165. part->exceptions,
  166. NULL,
  167. NULL,
  168. task->task_pool);
  169. if (part->utf_words) {
  170. part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
  171. sizeof (guint64), part->utf_words->len);
  172. rspamd_normalize_words (part->utf_words, task->task_pool);
  173. }
  174. }
  175. static void
  176. rspamd_mime_part_detect_language (struct rspamd_task *task,
  177. struct rspamd_mime_text_part *part)
  178. {
  179. struct rspamd_lang_detector_res *lang;
  180. if (!IS_TEXT_PART_EMPTY (part) && part->utf_words && part->utf_words->len > 0 &&
  181. task->lang_det) {
  182. if (rspamd_language_detector_detect (task, task->lang_det, part)) {
  183. lang = g_ptr_array_index (part->languages, 0);
  184. part->language = lang->lang;
  185. msg_info_task ("detected part language: %s", part->language);
  186. }
  187. else {
  188. part->language = "en"; /* Safe fallback */
  189. }
  190. }
  191. }
  /*
   * Copies the text in [begin, pe) into part->utf_stripped_content with line
   * breaks stripped, and gathers per-part text statistics on the way.
   *
   * - `p` is the read cursor, `c` is the start of the chunk that has not been
   *   appended to the output yet.
   * - A small state machine (normal_char / seen_cr / seen_lf) tracks line
   *   endings; `crlf_added` remembers whether a replacement space has already
   *   been written for the current line break.
   * - Offsets in the output where a line break was seen are pushed to
   *   part->newlines (stored as integers cast to pointers).
   * - For UTF parts zero width spaces are skipped, counted as spaces and mark
   *   the task with RSPAMD_TASK_FLAG_BAD_UNICODE.
   * - Counters updated: spaces, double_spaces, non_spaces, ascii_chars,
   *   non_ascii_chars, capital_letters, numeric_characters, nlines,
   *   empty_lines.
   */
  192. static void
  193. rspamd_strip_newlines_parse (struct rspamd_task *task,
  194. const gchar *begin, const gchar *pe,
  195. struct rspamd_mime_text_part *part)
  196. {
  197. const gchar *p = begin, *c = begin;
  198. gboolean crlf_added = FALSE, is_utf = IS_TEXT_PART_UTF (part);
  /* Set after '<' and cleared after '>' or any line break */
  199. gboolean url_open_bracket = FALSE;
  200. UChar32 uc;
  201. enum {
  202. normal_char,
  203. seen_cr,
  204. seen_lf,
  205. } state = normal_char;
  206. while (p < pe) {
  /* Multibyte lead in a UTF part: look for runs of zero width spaces */
  207. if (U8_IS_LEAD(*p) && is_utf) {
  208. gint32 off = p - begin;
  209. U8_NEXT (begin, off, pe - begin, uc);
  210. if (uc != -1) {
  211. while (p < pe && off < (pe - begin)) {
  212. if (IS_ZERO_WIDTH_SPACE (uc)) {
  213. /* Invisible space ! */
  214. task->flags |= RSPAMD_TASK_FLAG_BAD_UNICODE;
  215. part->spaces ++;
  /* Flush the pending chunk and restart it after the invisible space */
  216. if (p > c) {
  217. g_byte_array_append (part->utf_stripped_content,
  218. (const guint8 *) c, p - c);
  219. c = begin + off;
  220. p = c;
  221. }
  222. U8_NEXT (begin, off, pe - begin, uc);
  223. if (!IS_ZERO_WIDTH_SPACE (uc)) {
  224. break;
  225. }
  226. part->double_spaces ++;
  227. p = begin + off;
  228. c = p;
  229. }
  230. else {
  231. break;
  232. }
  233. }
  234. }
  235. }
  236. if (G_UNLIKELY (p >= pe)) {
  237. /*
  238. * This is reached when there is a utf8 part and we
  239. * have zero width spaces at the end of the text
  240. * So we just check overflow and refuse to access *p if it is
  241. * after our real content.
  242. */
  243. break;
  244. }
  245. else if (*p == '\r') {
  246. switch (state) {
  247. case normal_char:
  248. state = seen_cr;
  249. if (p > c) {
  250. g_byte_array_append (part->utf_stripped_content,
  251. (const guint8 *)c, p - c);
  252. }
  253. crlf_added = FALSE;
  254. c = p + 1;
  255. break;
  256. case seen_cr:
  257. /* Double \r\r */
  258. if (!crlf_added) {
  259. g_byte_array_append (part->utf_stripped_content,
  260. (const guint8 *)" ", 1);
  261. crlf_added = TRUE;
  262. g_ptr_array_add (part->newlines,
  263. (((gpointer) (goffset) (part->utf_stripped_content->len))));
  264. }
  265. part->nlines ++;
  266. part->empty_lines ++;
  267. c = p + 1;
  268. break;
  269. case seen_lf:
  270. /* Likely \r\n\r...*/
  271. state = seen_cr;
  272. c = p + 1;
  273. break;
  274. }
  275. url_open_bracket = FALSE;
  276. p ++;
  277. }
  278. else if (*p == '\n') {
  279. switch (state) {
  280. case normal_char:
  281. state = seen_lf;
  282. if (p > c) {
  283. g_byte_array_append (part->utf_stripped_content,
  284. (const guint8 *)c, p - c);
  285. }
  286. c = p + 1;
  /* In non-HTML parts no space is inserted while a '<' is still open */
  287. if (IS_TEXT_PART_HTML (part) || !url_open_bracket) {
  288. g_byte_array_append (part->utf_stripped_content,
  289. (const guint8 *)" ", 1);
  290. g_ptr_array_add (part->newlines,
  291. (((gpointer) (goffset) (part->utf_stripped_content->len))));
  292. crlf_added = TRUE;
  293. }
  294. else {
  295. crlf_added = FALSE;
  296. }
  297. break;
  298. case seen_cr:
  299. /* \r\n */
  300. if (!crlf_added) {
  301. if (IS_TEXT_PART_HTML (part) || !url_open_bracket) {
  302. g_byte_array_append (part->utf_stripped_content,
  303. (const guint8 *) " ", 1);
  304. crlf_added = TRUE;
  305. }
  306. g_ptr_array_add (part->newlines,
  307. (((gpointer) (goffset) (part->utf_stripped_content->len))));
  308. }
  309. c = p + 1;
  310. state = seen_lf;
  311. break;
  312. case seen_lf:
  313. /* Double \n\n */
  314. if (!crlf_added) {
  315. g_byte_array_append (part->utf_stripped_content,
  316. (const guint8 *)" ", 1);
  317. crlf_added = TRUE;
  318. g_ptr_array_add (part->newlines,
  319. (((gpointer) (goffset) (part->utf_stripped_content->len))));
  320. }
  321. part->nlines++;
  322. part->empty_lines ++;
  323. c = p + 1;
  324. break;
  325. }
  326. url_open_bracket = FALSE;
  327. p ++;
  328. }
  329. else {
  330. if ((*p) == '<') {
  331. url_open_bracket = TRUE;
  332. }
  333. else if ((*p) == '>') {
  334. url_open_bracket = FALSE;
  335. }
  336. switch (state) {
  337. case normal_char:
  /* Ordinary character inside a line: only update the counters */
  338. if (*p == ' ') {
  339. part->spaces ++;
  340. if (p > begin && *(p - 1) == ' ') {
  341. part->double_spaces ++;
  342. }
  343. }
  344. else {
  345. part->non_spaces ++;
  346. if ((*p) & 0x80) {
  347. part->non_ascii_chars ++;
  348. }
  349. else {
  350. if (g_ascii_isupper (*p)) {
  351. part->capital_letters ++;
  352. }
  353. else if (g_ascii_isdigit (*p)) {
  354. part->numeric_characters ++;
  355. }
  356. part->ascii_chars ++;
  357. }
  358. }
  359. break;
  360. case seen_cr:
  361. case seen_lf:
  /* First character after a line break: a new line starts here */
  362. part->nlines ++;
  363. if (!crlf_added) {
  364. g_ptr_array_add (part->newlines,
  365. (((gpointer) (goffset) (part->utf_stripped_content->len))));
  366. }
  367. /* Skip initial spaces */
  368. if (*p == ' ') {
  369. if (!crlf_added) {
  370. g_byte_array_append (part->utf_stripped_content,
  371. (const guint8 *)" ", 1);
  372. }
  373. while (p < pe && *p == ' ') {
  374. p ++;
  375. c ++;
  376. part->spaces ++;
  377. }
  378. if (p < pe && (*p == '\r' || *p == '\n')) {
  379. part->empty_lines ++;
  380. }
  381. }
  /* Re-run the loop on the same character in the normal_char state */
  382. state = normal_char;
  383. continue;
  384. }
  385. p ++;
  386. }
  387. }
  388. /* Leftover */
  389. if (p > c) {
  390. if (p > pe) {
  391. p = pe;
  392. }
  393. switch (state) {
  394. case normal_char:
  395. g_byte_array_append (part->utf_stripped_content,
  396. (const guint8 *)c, p - c);
  397. while (c < p) {
  398. if (*c == ' ') {
  399. part->spaces ++;
  400. if (c > begin && *(c - 1) == ' ') {
  401. part->double_spaces ++;
  402. }
  403. }
  404. else {
  405. part->non_spaces ++;
  406. if ((*c) & 0x80) {
  407. part->non_ascii_chars ++;
  408. }
  409. else {
  410. part->ascii_chars ++;
  411. }
  412. }
  413. c ++;
  414. }
  415. break;
  416. default:
  417. if (!crlf_added) {
  418. g_byte_array_append (part->utf_stripped_content,
  419. (const guint8 *)" ", 1);
  420. g_ptr_array_add (part->newlines,
  421. (((gpointer) (goffset) (part->utf_stripped_content->len))));
  422. }
  423. part->nlines++;
  424. break;
  425. }
  426. }
  427. }
  428. static void
  429. rspamd_u_text_dtor (void *p)
  430. {
  431. utext_close ((UText *)p);
  432. }
  433. static void
  434. rspamd_normalize_text_part (struct rspamd_task *task,
  435. struct rspamd_mime_text_part *part)
  436. {
  437. const gchar *p, *end;
  438. guint i;
  439. goffset off;
  440. struct rspamd_process_exception *ex;
  441. UErrorCode uc_err = U_ZERO_ERROR;
  442. part->newlines = g_ptr_array_sized_new (128);
  443. if (IS_TEXT_PART_EMPTY (part)) {
  444. part->utf_stripped_content = g_byte_array_new ();
  445. }
  446. else {
  447. part->utf_stripped_content = g_byte_array_sized_new (part->utf_content.len);
  448. p = (const gchar *)part->utf_content.begin;
  449. end = p + part->utf_content.len;
  450. rspamd_strip_newlines_parse (task, p, end, part);
  451. for (i = 0; i < part->newlines->len; i ++) {
  452. ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex));
  453. off = (goffset)g_ptr_array_index (part->newlines, i);
  454. g_ptr_array_index (part->newlines, i) = (gpointer)(goffset)
  455. (part->utf_stripped_content->data + off);
  456. ex->pos = off;
  457. ex->len = 0;
  458. ex->type = RSPAMD_EXCEPTION_NEWLINE;
  459. part->exceptions = g_list_prepend (part->exceptions, ex);
  460. }
  461. }
  462. if (IS_TEXT_PART_UTF (part)) {
  463. utext_openUTF8 (&part->utf_stripped_text,
  464. part->utf_stripped_content->data,
  465. part->utf_stripped_content->len,
  466. &uc_err);
  467. if (!U_SUCCESS (uc_err)) {
  468. msg_warn_task ("cannot open text from utf content");
  469. /* Probably, should be an assertion */
  470. }
  471. else {
  472. rspamd_mempool_add_destructor (task->task_pool,
  473. rspamd_u_text_dtor,
  474. &part->utf_stripped_text);
  475. }
  476. }
  477. rspamd_mempool_add_destructor (task->task_pool,
  478. (rspamd_mempool_destruct_t) free_byte_array_callback,
  479. part->utf_stripped_content);
  480. rspamd_mempool_notify_alloc (task->task_pool,
  481. part->utf_stripped_content->len);
  482. rspamd_mempool_add_destructor (task->task_pool,
  483. (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
  484. part->newlines);
  485. }
  486. #define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
  487. static guint
  488. rspamd_words_levenshtein_distance (struct rspamd_task *task,
  489. GArray *w1, GArray *w2)
  490. {
  491. guint s1len, s2len, x, y, lastdiag, olddiag;
  492. guint *column, ret;
  493. guint64 h1, h2;
  494. gint eq;
  495. static const guint max_words = 8192;
  496. s1len = w1->len;
  497. s2len = w2->len;
  498. if (s1len + s2len > max_words) {
  499. msg_err_task ("cannot compare parts with more than %ud words: (%ud + %ud)",
  500. max_words, s1len, s2len);
  501. return 0;
  502. }
  503. column = g_malloc0 ((s1len + 1) * sizeof (guint));
  504. for (y = 1; y <= s1len; y++) {
  505. column[y] = y;
  506. }
  507. for (x = 1; x <= s2len; x++) {
  508. column[0] = x;
  509. for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
  510. olddiag = column[y];
  511. h1 = g_array_index (w1, guint64, y - 1);
  512. h2 = g_array_index (w2, guint64, x - 1);
  513. eq = (h1 == h2) ? 1 : 0;
  514. /*
  515. * Cost of replacement is twice higher than cost of add/delete
  516. * to calculate percentage properly
  517. */
  518. column[y] = MIN3 (column[y] + 1, column[y - 1] + 1,
  519. lastdiag + (eq * 2));
  520. lastdiag = olddiag;
  521. }
  522. }
  523. ret = column[s1len];
  524. g_free (column);
  525. return ret;
  526. }
  527. static gint
  528. rspamd_multipattern_gtube_cb (struct rspamd_multipattern *mp,
  529. guint strnum,
  530. gint match_start,
  531. gint match_pos,
  532. const gchar *text,
  533. gsize len,
  534. void *context)
  535. {
  536. struct rspamd_task *task = (struct rspamd_task *)context;
  537. if (strnum > 0) {
  538. if (task->cfg->enable_test_patterns) {
  539. return strnum + 1;
  540. }
  541. return 0;
  542. }
  543. return strnum + 1; /* To distinguish from zero */
  544. }
  545. static enum rspamd_action_type
  546. rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part)
  547. {
  548. static const gsize max_check_size = 8 * 1024;
  549. gint ret;
  550. enum rspamd_action_type act = METRIC_ACTION_NOACTION;
  551. g_assert (part != NULL);
  552. if (gtube_matcher == NULL) {
  553. gtube_matcher = rspamd_multipattern_create (RSPAMD_MULTIPATTERN_DEFAULT);
  554. rspamd_multipattern_add_pattern (gtube_matcher,
  555. gtube_pattern_reject,
  556. RSPAMD_MULTIPATTERN_DEFAULT);
  557. rspamd_multipattern_add_pattern (gtube_matcher,
  558. gtube_pattern_add_header,
  559. RSPAMD_MULTIPATTERN_DEFAULT);
  560. rspamd_multipattern_add_pattern (gtube_matcher,
  561. gtube_pattern_rewrite_subject,
  562. RSPAMD_MULTIPATTERN_DEFAULT);
  563. rspamd_multipattern_add_pattern (gtube_matcher,
  564. gtube_pattern_no_action,
  565. RSPAMD_MULTIPATTERN_DEFAULT);
  566. g_assert (rspamd_multipattern_compile (gtube_matcher, NULL));
  567. }
  568. if (part->utf_content.len >= sizeof (gtube_pattern_reject) &&
  569. part->utf_content.len <= max_check_size) {
  570. if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content.begin,
  571. part->utf_content.len,
  572. rspamd_multipattern_gtube_cb, task, NULL)) > 0) {
  573. switch (ret) {
  574. case 1:
  575. act = METRIC_ACTION_REJECT;
  576. break;
  577. case 2:
  578. g_assert (task->cfg->enable_test_patterns);
  579. act = METRIC_ACTION_ADD_HEADER;
  580. break;
  581. case 3:
  582. g_assert (task->cfg->enable_test_patterns);
  583. act = METRIC_ACTION_REWRITE_SUBJECT;
  584. break;
  585. case 4:
  586. g_assert (task->cfg->enable_test_patterns);
  587. act = METRIC_ACTION_NOACTION;
  588. break;
  589. }
  590. if (ret != 0) {
  591. task->flags |= RSPAMD_TASK_FLAG_SKIP;
  592. task->flags |= RSPAMD_TASK_FLAG_GTUBE;
  593. msg_info_task (
  594. "gtube %s pattern has been found in part of length %uz",
  595. rspamd_action_to_str (act),
  596. part->utf_content.len);
  597. }
  598. }
  599. }
  600. return act;
  601. }
  602. static gint
  603. exceptions_compare_func (gconstpointer a, gconstpointer b)
  604. {
  605. const struct rspamd_process_exception *ea = a, *eb = b;
  606. return ea->pos - eb->pos;
  607. }
  608. static gboolean
  609. rspamd_message_process_plain_text_part (struct rspamd_task *task,
  610. struct rspamd_mime_text_part *text_part)
  611. {
  612. if (text_part->parsed.len == 0) {
  613. text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
  614. return TRUE;
  615. }
  616. rspamd_mime_text_part_maybe_convert (task, text_part);
  617. if (text_part->utf_raw_content != NULL) {
  618. /* Just have the same content */
  619. text_part->utf_content.begin = (const gchar *)text_part->utf_raw_content->data;
  620. text_part->utf_content.len = text_part->utf_raw_content->len;
  621. }
  622. else {
  623. /*
  624. * We ignore unconverted parts from now as it is dangerous
  625. * to treat them as text parts
  626. */
  627. text_part->utf_content.begin = NULL;
  628. text_part->utf_content.len = 0;
  629. return FALSE;
  630. }
  631. return TRUE;
  632. }
  633. static gboolean
  634. rspamd_message_process_html_text_part (struct rspamd_task *task,
  635. struct rspamd_mime_text_part *text_part)
  636. {
  637. text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML;
  638. if (text_part->parsed.len == 0) {
  639. text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
  640. return TRUE;
  641. }
  642. rspamd_mime_text_part_maybe_convert (task, text_part);
  643. if (text_part->utf_raw_content == NULL) {
  644. return FALSE;
  645. }
  646. text_part->html = rspamd_html_process_part_full (
  647. task->task_pool,
  648. text_part->utf_raw_content,
  649. &text_part->exceptions,
  650. MESSAGE_FIELD (task, urls),
  651. text_part->mime_part->urls,
  652. task->cfg ? task->cfg->enable_css_parser : true);
  653. rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);
  654. if (text_part->utf_content.len == 0) {
  655. text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
  656. }
  657. return TRUE;
  658. }
  659. static gboolean
  660. rspamd_message_process_text_part_maybe (struct rspamd_task *task,
  661. struct rspamd_mime_part *mime_part)
  662. {
  663. struct rspamd_mime_text_part *text_part;
  664. rspamd_ftok_t html_tok, xhtml_tok;
  665. gboolean found_html = FALSE, found_txt = FALSE;
  666. guint flags = 0;
  667. enum rspamd_action_type act;
  668. if ((mime_part->ct && (mime_part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) ||
  669. (mime_part->detected_type && strcmp (mime_part->detected_type, "text") == 0)) {
  670. found_txt = TRUE;
  671. html_tok.begin = "html";
  672. html_tok.len = 4;
  673. xhtml_tok.begin = "xhtml";
  674. xhtml_tok.len = 5;
  675. if (rspamd_ftok_casecmp (&mime_part->ct->subtype, &html_tok) == 0 ||
  676. rspamd_ftok_casecmp (&mime_part->ct->subtype, &xhtml_tok) == 0 ||
  677. (mime_part->detected_ext &&
  678. strcmp (mime_part->detected_ext, "html") == 0)) {
  679. found_html = TRUE;
  680. }
  681. }
  682. /* Skip attachments */
  683. if ((found_txt || found_html) &&
  684. (mime_part->cd && mime_part->cd->type == RSPAMD_CT_ATTACHMENT)) {
  685. if (!task->cfg->check_text_attachements) {
  686. debug_task ("skip attachments for checking as text parts");
  687. return FALSE;
  688. }
  689. else {
  690. flags |= RSPAMD_MIME_TEXT_PART_ATTACHMENT;
  691. }
  692. }
  693. else if (!(found_txt || found_html)) {
  694. /* Not a text part */
  695. return FALSE;
  696. }
  697. text_part = rspamd_mempool_alloc0 (task->task_pool,
  698. sizeof (struct rspamd_mime_text_part));
  699. text_part->mime_part = mime_part;
  700. text_part->raw.begin = mime_part->raw_data.begin;
  701. text_part->raw.len = mime_part->raw_data.len;
  702. text_part->parsed.begin = mime_part->parsed_data.begin;
  703. text_part->parsed.len = mime_part->parsed_data.len;
  704. text_part->utf_stripped_text = (UText)UTEXT_INITIALIZER;
  705. text_part->flags |= flags;
  706. if (found_html) {
  707. if (!rspamd_message_process_html_text_part (task, text_part)) {
  708. return FALSE;
  709. }
  710. }
  711. else {
  712. if (!rspamd_message_process_plain_text_part (task, text_part)) {
  713. return FALSE;
  714. }
  715. }
  716. g_ptr_array_add (MESSAGE_FIELD (task, text_parts), text_part);
  717. mime_part->part_type = RSPAMD_MIME_PART_TEXT;
  718. mime_part->specific.txt = text_part;
  719. act = rspamd_check_gtube (task, text_part);
  720. if (act != METRIC_ACTION_NOACTION) {
  721. struct rspamd_action *action;
  722. gdouble score = NAN;
  723. action = rspamd_config_get_action_by_type (task->cfg, act);
  724. if (action) {
  725. score = action->threshold;
  726. rspamd_add_passthrough_result (task, action,
  727. RSPAMD_PASSTHROUGH_CRITICAL,
  728. score, "Gtube pattern",
  729. "GTUBE", 0, NULL);
  730. }
  731. rspamd_task_insert_result (task, GTUBE_SYMBOL, 0, NULL);
  732. return TRUE;
  733. }
  734. /* Post process part */
  735. rspamd_normalize_text_part (task, text_part);
  736. if (!IS_TEXT_PART_HTML (text_part)) {
  737. if (mime_part->parent_part) {
  738. struct rspamd_mime_part *parent = mime_part->parent_part;
  739. if (IS_PART_MULTIPART (parent) && parent->specific.mp->children->len == 2) {
  740. /*
  741. * Use strict extraction mode: we will extract missing urls from
  742. * an html part if needed
  743. */
  744. rspamd_url_text_extract (task->task_pool, task, text_part,
  745. RSPAMD_URL_FIND_STRICT);
  746. }
  747. else {
  748. /*
  749. * Fall back to full text extraction using TLD patterns
  750. */
  751. rspamd_url_text_extract (task->task_pool, task, text_part,
  752. RSPAMD_URL_FIND_ALL);
  753. }
  754. }
  755. else {
  756. /*
  757. * Fall back to full text extraction using TLD patterns
  758. */
  759. rspamd_url_text_extract (task->task_pool, task, text_part,
  760. RSPAMD_URL_FIND_ALL);
  761. }
  762. }
  763. else {
  764. rspamd_url_text_extract (task->task_pool, task, text_part,
  765. RSPAMD_URL_FIND_STRICT);
  766. }
  767. if (text_part->exceptions) {
  768. text_part->exceptions = g_list_sort (text_part->exceptions,
  769. exceptions_compare_func);
  770. rspamd_mempool_add_destructor (task->task_pool,
  771. (rspamd_mempool_destruct_t)g_list_free,
  772. text_part->exceptions);
  773. }
  774. rspamd_mime_part_create_words (task, text_part);
  775. return TRUE;
  776. }
  777. /* Creates message from various data using libmagic to detect type */
  778. static void
  779. rspamd_message_from_data (struct rspamd_task *task, const guchar *start,
  780. gsize len)
  781. {
  782. struct rspamd_content_type *ct = NULL;
  783. struct rspamd_mime_part *part;
  784. const char *mb = "application/octet-stream";
  785. gchar *mid;
  786. rspamd_ftok_t srch, *tok;
  787. gchar cdbuf[1024];
  788. g_assert (start != NULL);
  789. part = rspamd_mempool_alloc0 (task->task_pool, sizeof (*part));
  790. part->raw_data.begin = start;
  791. part->raw_data.len = len;
  792. part->parsed_data.begin = start;
  793. part->parsed_data.len = len;
  794. part->part_number = MESSAGE_FIELD (task, parts)->len;
  795. part->urls = g_ptr_array_new ();
  796. part->raw_headers = rspamd_message_headers_new ();
  797. part->headers_order = NULL;
  798. tok = rspamd_task_get_request_header (task, "Content-Type");
  799. if (tok) {
  800. /* We have Content-Type defined */
  801. ct = rspamd_content_type_parse (tok->begin, tok->len,
  802. task->task_pool);
  803. part->ct = ct;
  804. }
  805. else if (task->cfg && task->cfg->libs_ctx) {
  806. lua_State *L = task->cfg->lua_state;
  807. if (rspamd_lua_require_function (L,
  808. "lua_magic", "detect_mime_part")) {
  809. struct rspamd_mime_part **pmime;
  810. struct rspamd_task **ptask;
  811. pmime = lua_newuserdata (L, sizeof (struct rspamd_mime_part *));
  812. rspamd_lua_setclass (L, "rspamd{mimepart}", -1);
  813. *pmime = part;
  814. ptask = lua_newuserdata (L, sizeof (struct rspamd_task *));
  815. rspamd_lua_setclass (L, "rspamd{task}", -1);
  816. *ptask = task;
  817. if (lua_pcall (L, 2, 2, 0) != 0) {
  818. msg_err_task ("cannot detect type: %s", lua_tostring (L, -1));
  819. }
  820. else {
  821. if (lua_istable (L, -1)) {
  822. lua_pushstring (L, "ct");
  823. lua_gettable (L, -2);
  824. if (lua_isstring (L, -1)) {
  825. mb = rspamd_mempool_strdup (task->task_pool,
  826. lua_tostring (L, -1));
  827. }
  828. }
  829. }
  830. lua_settop (L, 0);
  831. }
  832. else {
  833. msg_err_task ("cannot require lua_magic.detect_mime_part");
  834. }
  835. if (mb) {
  836. srch.begin = mb;
  837. srch.len = strlen (mb);
  838. ct = rspamd_content_type_parse (srch.begin, srch.len,
  839. task->task_pool);
  840. if (!part->ct) {
  841. msg_info_task ("construct fake mime of type: %s", mb);
  842. part->ct = ct;
  843. }
  844. else {
  845. /* Check sanity */
  846. if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) {
  847. RSPAMD_FTOK_FROM_STR (&srch, "application");
  848. if (rspamd_ftok_cmp (&ct->type, &srch) == 0) {
  849. msg_info_task ("construct fake mime of type: %s", mb);
  850. part->ct = ct;
  851. }
  852. }
  853. else {
  854. msg_info_task ("construct fake mime of type: %T/%T, detected %s",
  855. &part->ct->type, &part->ct->subtype, mb);
  856. }
  857. }
  858. part->detected_ct = ct;
  859. }
  860. }
  861. tok = rspamd_task_get_request_header (task, "Filename");
  862. if (tok) {
  863. rspamd_snprintf (cdbuf, sizeof (cdbuf), "inline; filename=\"%T\"", tok);
  864. }
  865. else {
  866. rspamd_snprintf (cdbuf, sizeof (cdbuf), "inline");
  867. }
  868. part->cd = rspamd_content_disposition_parse (cdbuf, strlen (cdbuf),
  869. task->task_pool);
  870. g_ptr_array_add (MESSAGE_FIELD (task, parts), part);
  871. rspamd_mime_parser_calc_digest (part);
  872. /* Generate message ID */
  873. mid = rspamd_mime_message_id_generate ("localhost.localdomain");
  874. rspamd_mempool_add_destructor (task->task_pool,
  875. (rspamd_mempool_destruct_t) g_free, mid);
  876. MESSAGE_FIELD (task, message_id) = mid;
  877. task->queue_id = mid;
  878. }
  879. static void
  880. rspamd_message_dtor (struct rspamd_message *msg)
  881. {
  882. guint i;
  883. struct rspamd_mime_part *p;
  884. struct rspamd_mime_text_part *tp;
  885. PTR_ARRAY_FOREACH (msg->parts, i, p) {
  886. if (p->raw_headers) {
  887. rspamd_message_headers_unref (p->raw_headers);
  888. }
  889. if (IS_PART_MULTIPART (p)) {
  890. if (p->specific.mp->children) {
  891. g_ptr_array_free (p->specific.mp->children, TRUE);
  892. }
  893. }
  894. if (p->part_type == RSPAMD_MIME_PART_CUSTOM_LUA &&
  895. p->specific.lua_specific.cbref != -1) {
  896. luaL_unref (msg->task->cfg->lua_state,
  897. LUA_REGISTRYINDEX,
  898. p->specific.lua_specific.cbref);
  899. }
  900. if (p->urls) {
  901. g_ptr_array_unref (p->urls);
  902. }
  903. }
  904. PTR_ARRAY_FOREACH (msg->text_parts, i, tp) {
  905. if (tp->utf_words) {
  906. g_array_free (tp->utf_words, TRUE);
  907. }
  908. if (tp->normalized_hashes) {
  909. g_array_free (tp->normalized_hashes, TRUE);
  910. }
  911. if (tp->languages) {
  912. g_ptr_array_unref (tp->languages);
  913. }
  914. }
  915. rspamd_message_headers_unref (msg->raw_headers);
  916. g_ptr_array_unref (msg->text_parts);
  917. g_ptr_array_unref (msg->parts);
  918. kh_destroy (rspamd_url_hash, msg->urls);
  919. }
  920. struct rspamd_message*
  921. rspamd_message_new (struct rspamd_task *task)
  922. {
  923. struct rspamd_message *msg;
  924. msg = rspamd_mempool_alloc0 (task->task_pool, sizeof (*msg));
  925. msg->raw_headers = rspamd_message_headers_new ();
  926. msg->urls = kh_init (rspamd_url_hash);
  927. msg->parts = g_ptr_array_sized_new (4);
  928. msg->text_parts = g_ptr_array_sized_new (2);
  929. msg->task = task;
  930. REF_INIT_RETAIN (msg, rspamd_message_dtor);
  931. return msg;
  932. }
  933. gboolean
  934. rspamd_message_parse (struct rspamd_task *task)
  935. {
  936. const gchar *p;
  937. gsize len;
  938. guint i;
  939. GError *err = NULL;
  940. guint64 n[2], seed;
  941. if (RSPAMD_TASK_IS_EMPTY (task)) {
  942. /* Don't do anything with empty task */
  943. task->flags |= RSPAMD_TASK_FLAG_SKIP_PROCESS;
  944. return TRUE;
  945. }
  946. p = task->msg.begin;
  947. len = task->msg.len;
  948. /* Skip any space characters to avoid some bad messages to be unparsed */
  949. while (len > 0 && g_ascii_isspace (*p)) {
  950. p ++;
  951. len --;
  952. }
  953. /*
  954. * Exim somehow uses mailbox format for messages being scanned:
  955. * From xxx@xxx.com Fri May 13 19:08:48 2016
  956. *
  957. * So we check if a task has this line to avoid possible issues
  958. */
  959. if (len > sizeof ("From ") - 1) {
  960. if (memcmp (p, "From ", sizeof ("From ") - 1) == 0) {
  961. /* Skip to CRLF */
  962. msg_info_task ("mailbox input detected, enable workaround");
  963. p += sizeof ("From ") - 1;
  964. len -= sizeof ("From ") - 1;
  965. while (len > 0 && *p != '\n') {
  966. p ++;
  967. len --;
  968. }
  969. while (len > 0 && g_ascii_isspace (*p)) {
  970. p ++;
  971. len --;
  972. }
  973. }
  974. }
  975. task->msg.begin = p;
  976. task->msg.len = len;
  977. /* Cleanup old message */
  978. if (task->message) {
  979. rspamd_message_unref (task->message);
  980. }
  981. task->message = rspamd_message_new (task);
  982. if (task->flags & RSPAMD_TASK_FLAG_MIME) {
  983. enum rspamd_mime_parse_error ret;
  984. debug_task ("construct mime parser from string length %d",
  985. (gint) task->msg.len);
  986. ret = rspamd_mime_parse_task (task, &err);
  987. switch (ret) {
  988. case RSPAMD_MIME_PARSE_FATAL:
  989. msg_err_task ("cannot construct mime from stream: %e", err);
  990. if (task->cfg && (!task->cfg->allow_raw_input)) {
  991. msg_err_task ("cannot construct mime from stream");
  992. if (err) {
  993. task->err = err;
  994. }
  995. return FALSE;
  996. }
  997. else {
  998. task->flags &= ~RSPAMD_TASK_FLAG_MIME;
  999. rspamd_message_from_data (task, p, len);
  1000. }
  1001. break;
  1002. case RSPAMD_MIME_PARSE_NESTING:
  1003. msg_warn_task ("cannot construct full mime from stream: %e", err);
  1004. task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS;
  1005. break;
  1006. case RSPAMD_MIME_PARSE_OK:
  1007. default:
  1008. break;
  1009. }
  1010. if (err) {
  1011. g_error_free (err);
  1012. }
  1013. }
  1014. else {
  1015. rspamd_message_from_data (task, p, len);
  1016. }
  1017. if (MESSAGE_FIELD (task, message_id) == NULL) {
  1018. MESSAGE_FIELD (task, message_id) = "undef";
  1019. }
  1020. debug_task ("found %ud parts in message", MESSAGE_FIELD (task, parts)->len);
  1021. if (task->queue_id == NULL) {
  1022. task->queue_id = "undef";
  1023. }
  1024. rspamd_received_maybe_fix_task(task);
  1025. struct rspamd_mime_part *part;
  1026. /* Blake2b applied to string 'rspamd' */
  1027. static const guchar RSPAMD_ALIGNED(32) hash_key[] = {
  1028. 0xef,0x43,0xae,0x80,0xcc,0x8d,0xc3,0x4c,
  1029. 0x6f,0x1b,0xd6,0x18,0x1b,0xae,0x87,0x74,
  1030. 0x0c,0xca,0xf7,0x8e,0x5f,0x2e,0x54,0x32,
  1031. 0xf6,0x79,0xb9,0x27,0x26,0x96,0x20,0x92,
  1032. 0x70,0x07,0x85,0xeb,0x83,0xf7,0x89,0xe0,
  1033. 0xd7,0x32,0x2a,0xd2,0x1a,0x64,0x41,0xef,
  1034. 0x49,0xff,0xc3,0x8c,0x54,0xf9,0x67,0x74,
  1035. 0x30,0x1e,0x70,0x2e,0xb7,0x12,0x09,0xfe,
  1036. };
  1037. memcpy (&seed, hash_key, sizeof (seed));
  1038. PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, parts), i, part) {
  1039. n[0] = t1ha2_atonce128 (&n[1],
  1040. part->digest, sizeof (part->digest),
  1041. seed);
  1042. seed = n[0] ^ n[1];
  1043. }
  1044. memcpy (MESSAGE_FIELD (task, digest), n, sizeof (n));
  1045. if (MESSAGE_FIELD (task, subject)) {
  1046. p = MESSAGE_FIELD (task, subject);
  1047. len = strlen (p);
  1048. n[0] = t1ha2_atonce128 (&n[1],
  1049. p, len,
  1050. seed);
  1051. memcpy (MESSAGE_FIELD (task, digest), n, sizeof (n));
  1052. }
  1053. if (task->queue_id) {
  1054. msg_info_task ("loaded message; id: <%s>; queue-id: <%s>; size: %z; "
  1055. "checksum: <%*xs>",
  1056. MESSAGE_FIELD (task, message_id), task->queue_id, task->msg.len,
  1057. (gint)sizeof (MESSAGE_FIELD (task, digest)), MESSAGE_FIELD (task, digest));
  1058. }
  1059. else {
  1060. msg_info_task ("loaded message; id: <%s>; size: %z; "
  1061. "checksum: <%*xs>",
  1062. MESSAGE_FIELD (task, message_id), task->msg.len,
  1063. (gint)sizeof (MESSAGE_FIELD (task, digest)), MESSAGE_FIELD (task, digest));
  1064. }
  1065. return TRUE;
  1066. }
  1067. void
  1068. rspamd_message_process (struct rspamd_task *task)
  1069. {
  1070. guint i;
  1071. struct rspamd_mime_text_part *p1, *p2;
  1072. gdouble diff, *pdiff;
  1073. guint tw, *ptw, dw;
  1074. struct rspamd_mime_part *part;
  1075. lua_State *L = NULL;
  1076. gint magic_func_pos = -1, content_func_pos = -1, old_top = -1, funcs_top = -1;
  1077. if (task->cfg) {
  1078. L = task->cfg->lua_state;
  1079. }
  1080. rspamd_archives_process (task);
  1081. if (L) {
  1082. old_top = lua_gettop (L);
  1083. }
  1084. if (L && rspamd_lua_require_function (L,
  1085. "lua_magic", "detect_mime_part")) {
  1086. magic_func_pos = lua_gettop (L);
  1087. }
  1088. else {
  1089. msg_err_task ("cannot require lua_magic.detect_mime_part");
  1090. }
  1091. if (L && rspamd_lua_require_function (L,
  1092. "lua_content", "maybe_process_mime_part")) {
  1093. content_func_pos = lua_gettop (L);
  1094. }
  1095. else {
  1096. msg_err_task ("cannot require lua_content.maybe_process_mime_part");
  1097. }
  1098. if (L) {
  1099. funcs_top = lua_gettop (L);
  1100. }
  1101. PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, parts), i, part) {
  1102. if (magic_func_pos != -1 && part->parsed_data.len > 0) {
  1103. struct rspamd_mime_part **pmime;
  1104. struct rspamd_task **ptask;
  1105. lua_pushcfunction (L, &rspamd_lua_traceback);
  1106. gint err_idx = lua_gettop (L);
  1107. lua_pushvalue (L, magic_func_pos);
  1108. pmime = lua_newuserdata (L, sizeof (struct rspamd_mime_part *));
  1109. rspamd_lua_setclass (L, "rspamd{mimepart}", -1);
  1110. *pmime = part;
  1111. ptask = lua_newuserdata (L, sizeof (struct rspamd_task *));
  1112. rspamd_lua_setclass (L, "rspamd{task}", -1);
  1113. *ptask = task;
  1114. if (lua_pcall (L, 2, 2, err_idx) != 0) {
  1115. msg_err_task ("cannot detect type: %s", lua_tostring (L, -1));
  1116. }
  1117. else {
  1118. if (lua_istable (L, -1)) {
  1119. const gchar *mb;
  1120. /* First returned value */
  1121. part->detected_ext = rspamd_mempool_strdup (task->task_pool,
  1122. lua_tostring (L, -2));
  1123. lua_pushstring (L, "ct");
  1124. lua_gettable (L, -2);
  1125. if (lua_isstring (L, -1)) {
  1126. mb = lua_tostring (L, -1);
  1127. if (mb) {
  1128. rspamd_ftok_t srch;
  1129. srch.begin = mb;
  1130. srch.len = strlen (mb);
  1131. part->detected_ct = rspamd_content_type_parse (srch.begin,
  1132. srch.len,
  1133. task->task_pool);
  1134. }
  1135. }
  1136. lua_pop (L, 1);
  1137. lua_pushstring (L, "type");
  1138. lua_gettable (L, -2);
  1139. if (lua_isstring (L, -1)) {
  1140. part->detected_type = rspamd_mempool_strdup (task->task_pool,
  1141. lua_tostring (L, -1));
  1142. }
  1143. lua_pop (L, 1);
  1144. lua_pushstring (L, "no_text");
  1145. lua_gettable (L, -2);
  1146. if (lua_isboolean (L, -1)) {
  1147. if (!!lua_toboolean (L, -1)) {
  1148. part->flags |= RSPAMD_MIME_PART_NO_TEXT_EXTRACTION;
  1149. }
  1150. }
  1151. lua_pop (L, 1);
  1152. }
  1153. }
  1154. lua_settop (L, funcs_top);
  1155. }
  1156. /* Now detect content */
  1157. if (content_func_pos != -1 && part->parsed_data.len > 0 &&
  1158. part->part_type == RSPAMD_MIME_PART_UNDEFINED) {
  1159. struct rspamd_mime_part **pmime;
  1160. struct rspamd_task **ptask;
  1161. lua_pushcfunction (L, &rspamd_lua_traceback);
  1162. gint err_idx = lua_gettop (L);
  1163. lua_pushvalue (L, content_func_pos);
  1164. pmime = lua_newuserdata (L, sizeof (struct rspamd_mime_part *));
  1165. rspamd_lua_setclass (L, "rspamd{mimepart}", -1);
  1166. *pmime = part;
  1167. ptask = lua_newuserdata (L, sizeof (struct rspamd_task *));
  1168. rspamd_lua_setclass (L, "rspamd{task}", -1);
  1169. *ptask = task;
  1170. if (lua_pcall (L, 2, 0, err_idx) != 0) {
  1171. msg_err_task ("cannot detect content: %s", lua_tostring (L, -1));
  1172. }
  1173. lua_settop (L, funcs_top);
  1174. }
  1175. /* Try to detect image before checking for text */
  1176. rspamd_images_process_mime_part_maybe (task, part);
  1177. /* Still no content detected, try text heuristic */
  1178. if (part->part_type == RSPAMD_MIME_PART_UNDEFINED &&
  1179. !(part->flags & RSPAMD_MIME_PART_NO_TEXT_EXTRACTION)) {
  1180. rspamd_message_process_text_part_maybe (task, part);
  1181. }
  1182. }
  1183. if (old_top != -1) {
  1184. lua_settop (L, old_top);
  1185. }
  1186. /* Parse urls inside Subject header */
  1187. if (MESSAGE_FIELD (task, subject)) {
  1188. rspamd_url_find_multiple (task->task_pool, MESSAGE_FIELD (task, subject),
  1189. strlen (MESSAGE_FIELD (task, subject)),
  1190. RSPAMD_URL_FIND_STRICT, NULL,
  1191. rspamd_url_task_subject_callback,
  1192. task);
  1193. }
  1194. /* Calculate average words length and number of short words */
  1195. struct rspamd_mime_text_part *text_part;
  1196. gdouble *var;
  1197. guint total_words = 0;
  1198. PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) {
  1199. if (!text_part->language) {
  1200. rspamd_mime_part_detect_language (task, text_part);
  1201. }
  1202. rspamd_mime_part_extract_words (task, text_part);
  1203. if (text_part->utf_words) {
  1204. total_words += text_part->nwords;
  1205. }
  1206. }
  1207. /* Calculate distance for 2-parts messages */
  1208. if (i == 2) {
  1209. p1 = g_ptr_array_index (MESSAGE_FIELD (task, text_parts), 0);
  1210. p2 = g_ptr_array_index (MESSAGE_FIELD (task, text_parts), 1);
  1211. /* First of all check parent object */
  1212. if (p1->mime_part->parent_part) {
  1213. rspamd_ftok_t srch;
  1214. srch.begin = "alternative";
  1215. srch.len = 11;
  1216. if (rspamd_ftok_cmp (&p1->mime_part->parent_part->ct->subtype, &srch) == 0) {
  1217. if (!IS_TEXT_PART_EMPTY (p1) && !IS_TEXT_PART_EMPTY (p2) &&
  1218. p1->normalized_hashes && p2->normalized_hashes) {
  1219. /*
  1220. * We also detect language on one part and propagate it to
  1221. * another one
  1222. */
  1223. struct rspamd_mime_text_part *sel;
  1224. /* Prefer HTML as text part is not displayed normally */
  1225. if (IS_TEXT_PART_HTML (p1)) {
  1226. sel = p1;
  1227. }
  1228. else if (IS_TEXT_PART_HTML (p2)) {
  1229. sel = p2;
  1230. }
  1231. else {
  1232. if (p1->utf_content.len > p2->utf_content.len) {
  1233. sel = p1;
  1234. }
  1235. else {
  1236. sel = p2;
  1237. }
  1238. }
  1239. if (sel->language && sel->language[0]) {
  1240. /* Propagate language */
  1241. if (sel == p1) {
  1242. if (p2->languages) {
  1243. g_ptr_array_unref (p2->languages);
  1244. }
  1245. p2->language = sel->language;
  1246. p2->languages = g_ptr_array_ref (sel->languages);
  1247. }
  1248. else {
  1249. if (p1->languages) {
  1250. g_ptr_array_unref (p1->languages);
  1251. }
  1252. p1->language = sel->language;
  1253. p1->languages = g_ptr_array_ref (sel->languages);
  1254. }
  1255. }
  1256. tw = p1->normalized_hashes->len + p2->normalized_hashes->len;
  1257. if (tw > 0) {
  1258. dw = rspamd_words_levenshtein_distance (task,
  1259. p1->normalized_hashes,
  1260. p2->normalized_hashes);
  1261. diff = dw / (gdouble)tw;
  1262. msg_debug_task (
  1263. "different words: %d, total words: %d, "
  1264. "got diff between parts of %.2f",
  1265. dw, tw,
  1266. diff);
  1267. pdiff = rspamd_mempool_alloc (task->task_pool,
  1268. sizeof (gdouble));
  1269. *pdiff = diff;
  1270. rspamd_mempool_set_variable (task->task_pool,
  1271. "parts_distance",
  1272. pdiff,
  1273. NULL);
  1274. ptw = rspamd_mempool_alloc (task->task_pool,
  1275. sizeof (gint));
  1276. *ptw = tw;
  1277. rspamd_mempool_set_variable (task->task_pool,
  1278. "total_words",
  1279. ptw,
  1280. NULL);
  1281. }
  1282. }
  1283. }
  1284. }
  1285. else {
  1286. debug_task (
  1287. "message contains two parts but they are in different multi-parts");
  1288. }
  1289. }
  1290. if (total_words > 0) {
  1291. var = rspamd_mempool_get_variable (task->task_pool,
  1292. RSPAMD_MEMPOOL_AVG_WORDS_LEN);
  1293. if (var) {
  1294. *var /= (double)total_words;
  1295. }
  1296. var = rspamd_mempool_get_variable (task->task_pool,
  1297. RSPAMD_MEMPOOL_SHORT_WORDS_CNT);
  1298. if (var) {
  1299. *var /= (double)total_words;
  1300. }
  1301. }
  1302. rspamd_images_link (task);
  1303. rspamd_tokenize_meta_words (task);
  1304. }
  1305. struct rspamd_message *
  1306. rspamd_message_ref (struct rspamd_message *msg)
  1307. {
  1308. REF_RETAIN (msg);
  1309. return msg;
  1310. }
  1311. void rspamd_message_unref (struct rspamd_message *msg)
  1312. {
  1313. if (msg) {
  1314. REF_RELEASE (msg);
  1315. }
  1316. }
  1317. void rspamd_message_update_digest (struct rspamd_message *msg,
  1318. const void *input, gsize len)
  1319. {
  1320. guint64 n[2];
  1321. /* Sanity */
  1322. G_STATIC_ASSERT (sizeof (n) == sizeof (msg->digest));
  1323. memcpy (n, msg->digest, sizeof (msg->digest));
  1324. n[0] = t1ha2_atonce128 (&n[1], input, len, n[0]);
  1325. memcpy (msg->digest, n, sizeof (msg->digest));
  1326. }