You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

message.c 41KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732
  1. /*
  2. * Copyright 2024 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "util.h"
  18. #include "rspamd.h"
  19. #include "message.h"
  20. #include "libserver/html/html.h"
  21. #include "images.h"
  22. #include "archives.h"
  23. #include "tokenizers/tokenizers.h"
  24. #include "smtp_parsers.h"
  25. #include "mime_parser.h"
  26. #include "mime_encoding.h"
  27. #include "lang_detection.h"
  28. #include "libutil/multipattern.h"
  29. #include "libserver/mempool_vars_internal.h"
  30. #ifdef WITH_SNOWBALL
  31. #include "libstemmer.h"
  32. #endif
  33. #include <math.h>
  34. #include <unicode/uchar.h>
  35. #include "sodium.h"
  36. #include "libserver/cfg_file_private.h"
  37. #include "lua/lua_common.h"
  38. #include "contrib/uthash/utlist.h"
  39. #include "contrib/t1ha/t1ha.h"
  40. #include "received.h"
  /* Name of the symbol inserted into the task result when a GTUBE pattern matches */
  #define GTUBE_SYMBOL "GTUBE"

  /* Clear or set the UTF flag in the flags word of a text part */
  #define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF)
  #define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF)

  /*
   * GTUBE test patterns. They differ only in the first character; the order in
   * which they are added to the matcher selects the action that is triggered
   * (reject, add header, rewrite subject, no action).
   */
  static const char gtube_pattern_reject[] =
      "XJS*C4JDBQADN1.NSBN3*2IDNEN*GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X";
  static const char gtube_pattern_add_header[] =
      "YJS*C4JDBQADN1.NSBN3*2IDNEN*GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X";
  static const char gtube_pattern_rewrite_subject[] =
      "ZJS*C4JDBQADN1.NSBN3*2IDNEN*GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X";
  static const char gtube_pattern_no_action[] =
      "AJS*C4JDBQADN1.NSBN3*2IDNEN*GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X";

  /* Multipattern matcher for the patterns above; created lazily on first use */
  struct rspamd_multipattern *gtube_matcher = NULL;

  /* Fixed seed used when hashing stemmed words */
  static const uint64_t words_hash_seed = 0xdeadbabe;
  54. static void
  55. free_byte_array_callback(void *pointer)
  56. {
  57. GByteArray *arr = (GByteArray *) pointer;
  58. g_byte_array_free(arr, TRUE);
  59. }
  60. static void
  61. rspamd_mime_part_extract_words(struct rspamd_task *task,
  62. struct rspamd_mime_text_part *part)
  63. {
  64. rspamd_stat_token_t *w;
  65. unsigned int i, total_len = 0, short_len = 0;
  66. if (part->utf_words) {
  67. rspamd_stem_words(part->utf_words, task->task_pool, part->language,
  68. task->lang_det);
  69. for (i = 0; i < part->utf_words->len; i++) {
  70. uint64_t h;
  71. w = &g_array_index(part->utf_words, rspamd_stat_token_t, i);
  72. if (w->stemmed.len > 0) {
  73. /*
  74. * We use static hash seed if we would want to use that in shingles
  75. * computation in future
  76. */
  77. h = rspamd_cryptobox_fast_hash_specific(
  78. RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
  79. w->stemmed.begin, w->stemmed.len, words_hash_seed);
  80. g_array_append_val(part->normalized_hashes, h);
  81. total_len += w->stemmed.len;
  82. if (w->stemmed.len <= 3) {
  83. short_len++;
  84. }
  85. if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT &&
  86. !(w->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED)) {
  87. part->nwords++;
  88. }
  89. }
  90. if (w->flags & (RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE |
  91. RSPAMD_STAT_TOKEN_FLAG_NORMALISED |
  92. RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES)) {
  93. task->flags |= RSPAMD_TASK_FLAG_BAD_UNICODE;
  94. }
  95. }
  96. if (part->utf_words->len) {
  97. double *avg_len_p, *short_len_p;
  98. avg_len_p = rspamd_mempool_get_variable(task->task_pool,
  99. RSPAMD_MEMPOOL_AVG_WORDS_LEN);
  100. if (avg_len_p == NULL) {
  101. avg_len_p = rspamd_mempool_alloc(task->task_pool,
  102. sizeof(double));
  103. *avg_len_p = total_len;
  104. rspamd_mempool_set_variable(task->task_pool,
  105. RSPAMD_MEMPOOL_AVG_WORDS_LEN, avg_len_p, NULL);
  106. }
  107. else {
  108. *avg_len_p += total_len;
  109. }
  110. short_len_p = rspamd_mempool_get_variable(task->task_pool,
  111. RSPAMD_MEMPOOL_SHORT_WORDS_CNT);
  112. if (short_len_p == NULL) {
  113. short_len_p = rspamd_mempool_alloc(task->task_pool,
  114. sizeof(double));
  115. *short_len_p = short_len;
  116. rspamd_mempool_set_variable(task->task_pool,
  117. RSPAMD_MEMPOOL_SHORT_WORDS_CNT, avg_len_p, NULL);
  118. }
  119. else {
  120. *short_len_p += short_len;
  121. }
  122. }
  123. }
  124. }
  125. static void
  126. rspamd_mime_part_create_words(struct rspamd_task *task,
  127. struct rspamd_mime_text_part *part)
  128. {
  129. enum rspamd_tokenize_type tok_type;
  130. if (IS_TEXT_PART_UTF(part)) {
  131. #if U_ICU_VERSION_MAJOR_NUM < 50
  132. /* Hack to prevent hang with Thai in old libicu */
  133. const char *p = part->utf_stripped_content->data, *end;
  134. unsigned int i = 0;
  135. end = p + part->utf_stripped_content->len;
  136. int32_t uc, sc;
  137. tok_type = RSPAMD_TOKENIZE_UTF;
  138. while (p + i < end) {
  139. U8_NEXT(p, i, part->utf_stripped_content->len, uc);
  140. if (((int32_t) uc) < 0) {
  141. tok_type = RSPAMD_TOKENIZE_RAW;
  142. break;
  143. }
  144. if (u_isalpha(uc)) {
  145. sc = ublock_getCode(uc);
  146. if (sc == UBLOCK_THAI) {
  147. msg_info_task("enable workaround for Thai characters for old libicu");
  148. tok_type = RSPAMD_TOKENIZE_RAW;
  149. break;
  150. }
  151. }
  152. }
  153. #else
  154. tok_type = RSPAMD_TOKENIZE_UTF;
  155. #endif
  156. }
  157. else {
  158. tok_type = RSPAMD_TOKENIZE_RAW;
  159. }
  160. part->utf_words = rspamd_tokenize_text(
  161. part->utf_stripped_content->data,
  162. part->utf_stripped_content->len,
  163. &part->utf_stripped_text,
  164. tok_type, task->cfg,
  165. part->exceptions,
  166. NULL,
  167. NULL,
  168. task->task_pool);
  169. if (part->utf_words) {
  170. part->normalized_hashes = g_array_sized_new(FALSE, FALSE,
  171. sizeof(uint64_t), part->utf_words->len);
  172. rspamd_normalize_words(part->utf_words, task->task_pool);
  173. }
  174. }
  175. static void
  176. rspamd_mime_part_detect_language(struct rspamd_task *task,
  177. struct rspamd_mime_text_part *part)
  178. {
  179. struct rspamd_lang_detector_res *lang;
  180. if (!IS_TEXT_PART_EMPTY(part) && part->utf_words && part->utf_words->len > 0 &&
  181. task->lang_det) {
  182. if (rspamd_language_detector_detect(task, task->lang_det, part)) {
  183. lang = g_ptr_array_index(part->languages, 0);
  184. part->language = lang->lang;
  185. msg_info_task("detected part language: %s", part->language);
  186. }
  187. else {
  188. part->language = "en"; /* Safe fallback */
  189. }
  190. }
  191. }
  192. static void
  193. rspamd_strip_newlines_parse(struct rspamd_task *task,
  194. const char *begin, const char *pe,
  195. struct rspamd_mime_text_part *part)
  196. {
  197. const char *p = begin, *c = begin;
  198. gboolean crlf_added = FALSE, is_utf = IS_TEXT_PART_UTF(part);
  199. gboolean url_open_bracket = FALSE;
  200. UChar32 uc;
  201. enum {
  202. normal_char,
  203. seen_cr,
  204. seen_lf,
  205. } state = normal_char;
  206. while (p < pe) {
  207. if (U8_IS_LEAD(*p) && is_utf) {
  208. int32_t off = p - begin;
  209. U8_NEXT(begin, off, pe - begin, uc);
  210. if (uc != -1) {
  211. while (p < pe && off < (pe - begin)) {
  212. if (IS_ZERO_WIDTH_SPACE(uc)) {
  213. /* Invisible space ! */
  214. task->flags |= RSPAMD_TASK_FLAG_BAD_UNICODE;
  215. part->spaces++;
  216. if (p > c) {
  217. g_byte_array_append(part->utf_stripped_content,
  218. (const uint8_t *) c, p - c);
  219. c = begin + off;
  220. p = c;
  221. }
  222. U8_NEXT(begin, off, pe - begin, uc);
  223. if (!IS_ZERO_WIDTH_SPACE(uc)) {
  224. break;
  225. }
  226. part->double_spaces++;
  227. p = begin + off;
  228. c = p;
  229. }
  230. else {
  231. break;
  232. }
  233. }
  234. }
  235. }
  236. if (G_UNLIKELY(p >= pe)) {
  237. /*
  238. * This is reached when there is a utf8 part and we
  239. * have zero width spaces at the end of the text
  240. * So we just check overflow and refuse to access *p if it is
  241. * after our real content.
  242. */
  243. break;
  244. }
  245. else if (*p == '\r') {
  246. switch (state) {
  247. case normal_char:
  248. state = seen_cr;
  249. if (p > c) {
  250. g_byte_array_append(part->utf_stripped_content,
  251. (const uint8_t *) c, p - c);
  252. }
  253. crlf_added = FALSE;
  254. c = p + 1;
  255. break;
  256. case seen_cr:
  257. /* Double \r\r */
  258. if (!crlf_added) {
  259. g_byte_array_append(part->utf_stripped_content,
  260. (const uint8_t *) " ", 1);
  261. crlf_added = TRUE;
  262. g_ptr_array_add(part->newlines,
  263. (((gpointer) (goffset) (part->utf_stripped_content->len))));
  264. }
  265. part->nlines++;
  266. part->empty_lines++;
  267. c = p + 1;
  268. break;
  269. case seen_lf:
  270. /* Likely \r\n\r...*/
  271. state = seen_cr;
  272. c = p + 1;
  273. break;
  274. }
  275. url_open_bracket = FALSE;
  276. p++;
  277. }
  278. else if (*p == '\n') {
  279. switch (state) {
  280. case normal_char:
  281. state = seen_lf;
  282. if (p > c) {
  283. g_byte_array_append(part->utf_stripped_content,
  284. (const uint8_t *) c, p - c);
  285. }
  286. c = p + 1;
  287. if (IS_TEXT_PART_HTML(part) || !url_open_bracket) {
  288. g_byte_array_append(part->utf_stripped_content,
  289. (const uint8_t *) " ", 1);
  290. g_ptr_array_add(part->newlines,
  291. (((gpointer) (goffset) (part->utf_stripped_content->len))));
  292. crlf_added = TRUE;
  293. }
  294. else {
  295. crlf_added = FALSE;
  296. }
  297. break;
  298. case seen_cr:
  299. /* \r\n */
  300. if (!crlf_added) {
  301. if (IS_TEXT_PART_HTML(part) || !url_open_bracket) {
  302. g_byte_array_append(part->utf_stripped_content,
  303. (const uint8_t *) " ", 1);
  304. crlf_added = TRUE;
  305. }
  306. g_ptr_array_add(part->newlines,
  307. (((gpointer) (goffset) (part->utf_stripped_content->len))));
  308. }
  309. c = p + 1;
  310. state = seen_lf;
  311. break;
  312. case seen_lf:
  313. /* Double \n\n */
  314. if (!crlf_added) {
  315. g_byte_array_append(part->utf_stripped_content,
  316. (const uint8_t *) " ", 1);
  317. crlf_added = TRUE;
  318. g_ptr_array_add(part->newlines,
  319. (((gpointer) (goffset) (part->utf_stripped_content->len))));
  320. }
  321. part->nlines++;
  322. part->empty_lines++;
  323. c = p + 1;
  324. break;
  325. }
  326. url_open_bracket = FALSE;
  327. p++;
  328. }
  329. else {
  330. if ((*p) == '<') {
  331. url_open_bracket = TRUE;
  332. }
  333. else if ((*p) == '>') {
  334. url_open_bracket = FALSE;
  335. }
  336. switch (state) {
  337. case normal_char:
  338. if (*p == ' ') {
  339. part->spaces++;
  340. if (p > begin && *(p - 1) == ' ') {
  341. part->double_spaces++;
  342. }
  343. }
  344. else {
  345. part->non_spaces++;
  346. if ((*p) & 0x80) {
  347. part->non_ascii_chars++;
  348. }
  349. else {
  350. if (g_ascii_isupper(*p)) {
  351. part->capital_letters++;
  352. }
  353. else if (g_ascii_isdigit(*p)) {
  354. part->numeric_characters++;
  355. }
  356. part->ascii_chars++;
  357. }
  358. }
  359. break;
  360. case seen_cr:
  361. case seen_lf:
  362. part->nlines++;
  363. if (!crlf_added) {
  364. g_ptr_array_add(part->newlines,
  365. (((gpointer) (goffset) (part->utf_stripped_content->len))));
  366. }
  367. /* Skip initial spaces */
  368. if (*p == ' ') {
  369. if (!crlf_added) {
  370. g_byte_array_append(part->utf_stripped_content,
  371. (const uint8_t *) " ", 1);
  372. }
  373. while (p < pe && *p == ' ') {
  374. p++;
  375. c++;
  376. part->spaces++;
  377. }
  378. if (p < pe && (*p == '\r' || *p == '\n')) {
  379. part->empty_lines++;
  380. }
  381. }
  382. state = normal_char;
  383. continue;
  384. }
  385. p++;
  386. }
  387. }
  388. /* Leftover */
  389. if (p > c) {
  390. if (p > pe) {
  391. p = pe;
  392. }
  393. switch (state) {
  394. case normal_char:
  395. g_byte_array_append(part->utf_stripped_content,
  396. (const uint8_t *) c, p - c);
  397. while (c < p) {
  398. if (*c == ' ') {
  399. part->spaces++;
  400. if (c > begin && *(c - 1) == ' ') {
  401. part->double_spaces++;
  402. }
  403. }
  404. else {
  405. part->non_spaces++;
  406. if ((*c) & 0x80) {
  407. part->non_ascii_chars++;
  408. }
  409. else {
  410. part->ascii_chars++;
  411. }
  412. }
  413. c++;
  414. }
  415. break;
  416. default:
  417. if (!crlf_added) {
  418. g_byte_array_append(part->utf_stripped_content,
  419. (const uint8_t *) " ", 1);
  420. g_ptr_array_add(part->newlines,
  421. (((gpointer) (goffset) (part->utf_stripped_content->len))));
  422. }
  423. part->nlines++;
  424. break;
  425. }
  426. }
  427. }
  428. static void
  429. rspamd_u_text_dtor(void *p)
  430. {
  431. utext_close((UText *) p);
  432. }
  433. static void
  434. rspamd_normalize_text_part(struct rspamd_task *task,
  435. struct rspamd_mime_text_part *part)
  436. {
  437. const char *p, *end;
  438. unsigned int i;
  439. goffset off;
  440. struct rspamd_process_exception *ex;
  441. UErrorCode uc_err = U_ZERO_ERROR;
  442. part->newlines = g_ptr_array_sized_new(128);
  443. if (IS_TEXT_PART_EMPTY(part)) {
  444. part->utf_stripped_content = g_byte_array_new();
  445. }
  446. else {
  447. part->utf_stripped_content = g_byte_array_sized_new(part->utf_content.len);
  448. p = (const char *) part->utf_content.begin;
  449. end = p + part->utf_content.len;
  450. rspamd_strip_newlines_parse(task, p, end, part);
  451. for (i = 0; i < part->newlines->len; i++) {
  452. ex = rspamd_mempool_alloc(task->task_pool, sizeof(*ex));
  453. off = (goffset) g_ptr_array_index(part->newlines, i);
  454. g_ptr_array_index(part->newlines, i) = (gpointer) (goffset) (part->utf_stripped_content->data + off);
  455. ex->pos = off;
  456. ex->len = 0;
  457. ex->type = RSPAMD_EXCEPTION_NEWLINE;
  458. part->exceptions = g_list_prepend(part->exceptions, ex);
  459. }
  460. }
  461. if (IS_TEXT_PART_UTF(part)) {
  462. utext_openUTF8(&part->utf_stripped_text,
  463. part->utf_stripped_content->data,
  464. part->utf_stripped_content->len,
  465. &uc_err);
  466. if (!U_SUCCESS(uc_err)) {
  467. msg_warn_task("cannot open text from utf content");
  468. /* Probably, should be an assertion */
  469. }
  470. else {
  471. rspamd_mempool_add_destructor(task->task_pool,
  472. rspamd_u_text_dtor,
  473. &part->utf_stripped_text);
  474. }
  475. }
  476. rspamd_mempool_add_destructor(task->task_pool,
  477. (rspamd_mempool_destruct_t) free_byte_array_callback,
  478. part->utf_stripped_content);
  479. rspamd_mempool_notify_alloc(task->task_pool,
  480. part->utf_stripped_content->len);
  481. rspamd_mempool_add_destructor(task->task_pool,
  482. (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
  483. part->newlines);
  484. }
  485. #define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
  486. static unsigned int
  487. rspamd_words_levenshtein_distance(struct rspamd_task *task,
  488. GArray *w1, GArray *w2)
  489. {
  490. unsigned int s1len, s2len, x, y, lastdiag, olddiag;
  491. unsigned int *column, ret;
  492. uint64_t h1, h2;
  493. int eq;
  494. static const unsigned int max_words = 8192;
  495. s1len = w1->len;
  496. s2len = w2->len;
  497. if (s1len + s2len > max_words) {
  498. msg_info_task("cannot direct compare multipart/alternative parts with more than %ud words in total: "
  499. "(%ud words in one part and %ud in another)",
  500. max_words, s1len, s2len);
  501. /* Use approximate comparison of number of words */
  502. if (s1len > s2len) {
  503. return s1len - s2len;
  504. }
  505. else {
  506. return s2len - s1len;
  507. }
  508. }
  509. column = g_malloc0((s1len + 1) * sizeof(unsigned int));
  510. for (y = 1; y <= s1len; y++) {
  511. column[y] = y;
  512. }
  513. for (x = 1; x <= s2len; x++) {
  514. column[0] = x;
  515. for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
  516. olddiag = column[y];
  517. h1 = g_array_index(w1, uint64_t, y - 1);
  518. h2 = g_array_index(w2, uint64_t, x - 1);
  519. eq = (h1 == h2) ? 1 : 0;
  520. /*
  521. * Cost of replacement is twice higher than cost of add/delete
  522. * to calculate percentage properly
  523. */
  524. column[y] = MIN3(column[y] + 1, column[y - 1] + 1,
  525. lastdiag + (eq * 2));
  526. lastdiag = olddiag;
  527. }
  528. }
  529. ret = column[s1len];
  530. g_free(column);
  531. return ret;
  532. }
  533. static int
  534. rspamd_multipattern_gtube_cb(struct rspamd_multipattern *mp,
  535. unsigned int strnum,
  536. int match_start,
  537. int match_pos,
  538. const char *text,
  539. gsize len,
  540. void *context)
  541. {
  542. struct rspamd_task *task = (struct rspamd_task *) context;
  543. if (strnum > 0) {
  544. if (task->cfg->gtube_patterns_policy == RSPAMD_GTUBE_ALL) {
  545. return strnum + 1;
  546. }
  547. return 0;
  548. }
  549. return strnum + 1; /* To distinguish from zero */
  550. }
  551. static enum rspamd_action_type
  552. rspamd_check_gtube(struct rspamd_task *task, struct rspamd_mime_text_part *part)
  553. {
  554. static const gsize max_check_size = 8 * 1024;
  555. int ret;
  556. enum rspamd_action_type act = METRIC_ACTION_NOACTION;
  557. enum rspamd_gtube_patterns_policy policy = task->cfg ? task->cfg->gtube_patterns_policy : RSPAMD_GTUBE_REJECT;
  558. g_assert(part != NULL);
  559. if (gtube_matcher == NULL && policy != RSPAMD_GTUBE_DISABLED) {
  560. gtube_matcher = rspamd_multipattern_create(RSPAMD_MULTIPATTERN_DEFAULT);
  561. rspamd_multipattern_add_pattern(gtube_matcher,
  562. gtube_pattern_reject,
  563. RSPAMD_MULTIPATTERN_DEFAULT);
  564. rspamd_multipattern_add_pattern(gtube_matcher,
  565. gtube_pattern_add_header,
  566. RSPAMD_MULTIPATTERN_DEFAULT);
  567. rspamd_multipattern_add_pattern(gtube_matcher,
  568. gtube_pattern_rewrite_subject,
  569. RSPAMD_MULTIPATTERN_DEFAULT);
  570. rspamd_multipattern_add_pattern(gtube_matcher,
  571. gtube_pattern_no_action,
  572. RSPAMD_MULTIPATTERN_DEFAULT);
  573. GError *err = NULL;
  574. rspamd_multipattern_compile(gtube_matcher, RSPAMD_MULTIPATTERN_COMPILE_NO_FS, &err);
  575. if (err != NULL) {
  576. /* It will be expensive, but I don't care, still better than to abort */
  577. msg_err("cannot compile gtube matcher: %s", err->message);
  578. g_error_free(err);
  579. }
  580. }
  581. if (part->utf_content.len >= sizeof(gtube_pattern_reject) &&
  582. part->utf_content.len <= max_check_size &&
  583. policy != RSPAMD_GTUBE_DISABLED) {
  584. if ((ret = rspamd_multipattern_lookup(gtube_matcher, part->utf_content.begin,
  585. part->utf_content.len,
  586. rspamd_multipattern_gtube_cb, task, NULL)) > 0) {
  587. switch (ret) {
  588. case 1:
  589. act = METRIC_ACTION_REJECT;
  590. break;
  591. case 2:
  592. act = METRIC_ACTION_ADD_HEADER;
  593. break;
  594. case 3:
  595. act = METRIC_ACTION_REWRITE_SUBJECT;
  596. break;
  597. case 4:
  598. act = METRIC_ACTION_NOACTION;
  599. break;
  600. }
  601. if (ret != 0) {
  602. task->flags |= RSPAMD_TASK_FLAG_SKIP;
  603. task->flags |= RSPAMD_TASK_FLAG_GTUBE;
  604. msg_info_task(
  605. "gtube %s pattern has been found in part of length %uz",
  606. rspamd_action_to_str(act),
  607. part->utf_content.len);
  608. }
  609. }
  610. }
  611. return act;
  612. }
  613. static int
  614. exceptions_compare_func(gconstpointer a, gconstpointer b)
  615. {
  616. const struct rspamd_process_exception *ea = a, *eb = b;
  617. return ea->pos - eb->pos;
  618. }
  619. static gboolean
  620. rspamd_message_process_plain_text_part(struct rspamd_task *task,
  621. struct rspamd_mime_text_part *text_part)
  622. {
  623. if (text_part->parsed.len == 0) {
  624. text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
  625. return TRUE;
  626. }
  627. rspamd_mime_text_part_maybe_convert(task, text_part);
  628. if (text_part->utf_raw_content != NULL) {
  629. /* Just have the same content */
  630. text_part->utf_content.begin = (const char *) text_part->utf_raw_content->data;
  631. text_part->utf_content.len = text_part->utf_raw_content->len;
  632. }
  633. else {
  634. /*
  635. * We ignore unconverted parts from now as it is dangerous
  636. * to treat them as text parts
  637. */
  638. text_part->utf_content.begin = NULL;
  639. text_part->utf_content.len = 0;
  640. return FALSE;
  641. }
  642. return TRUE;
  643. }
  644. static gboolean
  645. rspamd_message_process_html_text_part(struct rspamd_task *task,
  646. struct rspamd_mime_text_part *text_part,
  647. uint16_t *cur_url_order)
  648. {
  649. text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML;
  650. if (text_part->parsed.len == 0) {
  651. text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
  652. return TRUE;
  653. }
  654. rspamd_mime_text_part_maybe_convert(task, text_part);
  655. if (text_part->utf_raw_content == NULL) {
  656. return FALSE;
  657. }
  658. text_part->html = rspamd_html_process_part_full(
  659. task,
  660. text_part->utf_raw_content,
  661. &text_part->exceptions,
  662. MESSAGE_FIELD(task, urls),
  663. text_part->mime_part->urls,
  664. task->cfg ? task->cfg->enable_css_parser : true,
  665. cur_url_order);
  666. rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);
  667. if (text_part->utf_content.len == 0) {
  668. text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
  669. }
  670. return TRUE;
  671. }
  /* Classification of a MIME part with regard to text processing */
  enum rspamd_message_part_is_text_result {
      /* Textual part that is not HTML */
      RSPAMD_MESSAGE_PART_IS_TEXT_PLAIN = 0,
      /* Textual part with HTML content */
      RSPAMD_MESSAGE_PART_IS_TEXT_HTML = 1,
      /* Part that must not be processed as text */
      RSPAMD_MESSAGE_PART_IS_NOT_TEXT = 2
  };
  677. static enum rspamd_message_part_is_text_result
  678. rspamd_message_part_can_be_parsed_as_text(struct rspamd_task *task,
  679. struct rspamd_mime_part *mime_part)
  680. {
  681. enum rspamd_message_part_is_text_result res = RSPAMD_MESSAGE_PART_IS_NOT_TEXT;
  682. if ((mime_part->ct && (mime_part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) ||
  683. (mime_part->detected_type && strcmp(mime_part->detected_type, "text") == 0)) {
  684. res = RSPAMD_MESSAGE_PART_IS_TEXT_PLAIN;
  685. rspamd_ftok_t html_tok, xhtml_tok;
  686. html_tok.begin = "html";
  687. html_tok.len = 4;
  688. xhtml_tok.begin = "xhtml";
  689. xhtml_tok.len = 5;
  690. if (rspamd_ftok_casecmp(&mime_part->ct->subtype, &html_tok) == 0 ||
  691. rspamd_ftok_casecmp(&mime_part->ct->subtype, &xhtml_tok) == 0 ||
  692. (mime_part->detected_ext &&
  693. strcmp(mime_part->detected_ext, "html") == 0)) {
  694. res = RSPAMD_MESSAGE_PART_IS_TEXT_HTML;
  695. }
  696. }
  697. /* Skip attachments */
  698. if (res != RSPAMD_MESSAGE_PART_IS_NOT_TEXT &&
  699. (mime_part->cd && mime_part->cd->type == RSPAMD_CT_ATTACHMENT)) {
  700. if (!task->cfg->check_text_attachements) {
  701. debug_task("skip attachments for checking as text parts");
  702. return RSPAMD_MESSAGE_PART_IS_NOT_TEXT;
  703. }
  704. }
  705. return res;
  706. }
  707. static gboolean
  708. rspamd_message_process_text_part_maybe(struct rspamd_task *task,
  709. struct rspamd_mime_part *mime_part,
  710. enum rspamd_message_part_is_text_result is_text,
  711. uint16_t *cur_url_order)
  712. {
  713. struct rspamd_mime_text_part *text_part;
  714. unsigned int flags = 0;
  715. enum rspamd_action_type act;
  716. /* Skip attachments */
  717. if ((mime_part->cd && mime_part->cd->type == RSPAMD_CT_ATTACHMENT)) {
  718. flags |= RSPAMD_MIME_TEXT_PART_ATTACHMENT;
  719. }
  720. text_part = rspamd_mempool_alloc0(task->task_pool,
  721. sizeof(struct rspamd_mime_text_part));
  722. text_part->mime_part = mime_part;
  723. text_part->raw.begin = mime_part->raw_data.begin;
  724. text_part->raw.len = mime_part->raw_data.len;
  725. text_part->parsed.begin = mime_part->parsed_data.begin;
  726. text_part->parsed.len = mime_part->parsed_data.len;
  727. text_part->utf_stripped_text = (UText) UTEXT_INITIALIZER;
  728. text_part->flags |= flags;
  729. if (is_text == RSPAMD_MESSAGE_PART_IS_TEXT_HTML) {
  730. if (!rspamd_message_process_html_text_part(task, text_part, cur_url_order)) {
  731. return FALSE;
  732. }
  733. }
  734. else {
  735. if (!rspamd_message_process_plain_text_part(task, text_part)) {
  736. return FALSE;
  737. }
  738. }
  739. g_ptr_array_add(MESSAGE_FIELD(task, text_parts), text_part);
  740. mime_part->part_type = RSPAMD_MIME_PART_TEXT;
  741. mime_part->specific.txt = text_part;
  742. act = rspamd_check_gtube(task, text_part);
  743. if (act != METRIC_ACTION_NOACTION) {
  744. struct rspamd_action *action;
  745. double score = NAN;
  746. action = rspamd_config_get_action_by_type(task->cfg, act);
  747. if (action) {
  748. score = action->threshold;
  749. rspamd_add_passthrough_result(task, action,
  750. RSPAMD_PASSTHROUGH_CRITICAL,
  751. score, "Gtube pattern",
  752. "GTUBE", 0, NULL);
  753. }
  754. rspamd_task_insert_result(task, GTUBE_SYMBOL, 0, NULL);
  755. return TRUE;
  756. }
  757. /* Post process part */
  758. rspamd_normalize_text_part(task, text_part);
  759. if (!IS_TEXT_PART_HTML(text_part)) {
  760. if (mime_part->parent_part) {
  761. struct rspamd_mime_part *parent = mime_part->parent_part;
  762. if (IS_PART_MULTIPART(parent) && parent->specific.mp->children->len == 2) {
  763. /*
  764. * Use strict extraction mode: we will extract missing urls from
  765. * an html part if needed
  766. */
  767. rspamd_url_text_extract(task->task_pool, task, text_part, cur_url_order,
  768. RSPAMD_URL_FIND_STRICT);
  769. }
  770. else {
  771. /*
  772. * Fall back to full text extraction using TLD patterns
  773. */
  774. rspamd_url_text_extract(task->task_pool, task, text_part, cur_url_order,
  775. RSPAMD_URL_FIND_ALL);
  776. }
  777. }
  778. else {
  779. /*
  780. * Fall back to full text extraction using TLD patterns
  781. */
  782. rspamd_url_text_extract(task->task_pool, task, text_part, cur_url_order,
  783. RSPAMD_URL_FIND_ALL);
  784. }
  785. }
  786. else {
  787. rspamd_url_text_extract(task->task_pool, task, text_part, cur_url_order,
  788. RSPAMD_URL_FIND_STRICT);
  789. }
  790. if (text_part->exceptions) {
  791. text_part->exceptions = g_list_sort(text_part->exceptions,
  792. exceptions_compare_func);
  793. rspamd_mempool_add_destructor(task->task_pool,
  794. (rspamd_mempool_destruct_t) g_list_free,
  795. text_part->exceptions);
  796. }
  797. rspamd_mime_part_create_words(task, text_part);
  798. return TRUE;
  799. }
  800. /* Creates message from various data using libmagic to detect type */
  801. static void
  802. rspamd_message_from_data(struct rspamd_task *task, const unsigned char *start,
  803. gsize len)
  804. {
  805. struct rspamd_content_type *ct = NULL;
  806. struct rspamd_mime_part *part;
  807. const char *mb = "application/octet-stream";
  808. char *mid;
  809. rspamd_ftok_t srch, *tok;
  810. char cdbuf[1024];
  811. g_assert(start != NULL);
  812. part = rspamd_mempool_alloc0(task->task_pool, sizeof(*part));
  813. part->raw_data.begin = start;
  814. part->raw_data.len = len;
  815. part->parsed_data.begin = start;
  816. part->parsed_data.len = len;
  817. part->part_number = MESSAGE_FIELD(task, parts)->len;
  818. part->urls = g_ptr_array_new();
  819. part->raw_headers = rspamd_message_headers_new();
  820. part->headers_order = NULL;
  821. tok = rspamd_task_get_request_header(task, "Content-Type");
  822. if (tok) {
  823. /* We have Content-Type defined */
  824. ct = rspamd_content_type_parse(tok->begin, tok->len,
  825. task->task_pool);
  826. part->ct = ct;
  827. }
  828. else if (task->cfg && task->cfg->libs_ctx) {
  829. lua_State *L = task->cfg->lua_state;
  830. if (rspamd_lua_require_function(L,
  831. "lua_magic", "detect_mime_part")) {
  832. struct rspamd_mime_part **pmime;
  833. struct rspamd_task **ptask;
  834. pmime = lua_newuserdata(L, sizeof(struct rspamd_mime_part *));
  835. rspamd_lua_setclass(L, rspamd_mimepart_classname, -1);
  836. *pmime = part;
  837. ptask = lua_newuserdata(L, sizeof(struct rspamd_task *));
  838. rspamd_lua_setclass(L, rspamd_task_classname, -1);
  839. *ptask = task;
  840. if (lua_pcall(L, 2, 2, 0) != 0) {
  841. msg_err_task("cannot detect type: %s", lua_tostring(L, -1));
  842. }
  843. else {
  844. if (lua_istable(L, -1)) {
  845. lua_pushstring(L, "ct");
  846. lua_gettable(L, -2);
  847. if (lua_isstring(L, -1)) {
  848. mb = rspamd_mempool_strdup(task->task_pool,
  849. lua_tostring(L, -1));
  850. }
  851. }
  852. }
  853. lua_settop(L, 0);
  854. }
  855. else {
  856. msg_err_task("cannot require lua_magic.detect_mime_part");
  857. }
  858. if (mb) {
  859. srch.begin = mb;
  860. srch.len = strlen(mb);
  861. ct = rspamd_content_type_parse(srch.begin, srch.len,
  862. task->task_pool);
  863. if (!part->ct) {
  864. msg_info_task("construct fake mime of type: %s", mb);
  865. part->ct = ct;
  866. }
  867. else {
  868. /* Check sanity */
  869. if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) {
  870. RSPAMD_FTOK_FROM_STR(&srch, "application");
  871. if (rspamd_ftok_cmp(&ct->type, &srch) == 0) {
  872. msg_info_task("construct fake mime of type: %s", mb);
  873. part->ct = ct;
  874. }
  875. }
  876. else {
  877. msg_info_task("construct fake mime of type: %T/%T, detected %s",
  878. &part->ct->type, &part->ct->subtype, mb);
  879. }
  880. }
  881. part->detected_ct = ct;
  882. }
  883. }
  884. tok = rspamd_task_get_request_header(task, "Filename");
  885. if (tok) {
  886. rspamd_snprintf(cdbuf, sizeof(cdbuf), "inline; filename=\"%T\"", tok);
  887. }
  888. else {
  889. rspamd_snprintf(cdbuf, sizeof(cdbuf), "inline");
  890. }
  891. part->cd = rspamd_content_disposition_parse(cdbuf, strlen(cdbuf),
  892. task->task_pool);
  893. g_ptr_array_add(MESSAGE_FIELD(task, parts), part);
  894. rspamd_mime_parser_calc_digest(part);
  895. /* Generate message ID */
  896. mid = rspamd_mime_message_id_generate("localhost.localdomain");
  897. rspamd_mempool_add_destructor(task->task_pool,
  898. (rspamd_mempool_destruct_t) g_free, mid);
  899. MESSAGE_FIELD(task, message_id) = mid;
  900. task->queue_id = mid;
  901. }
  902. static void
  903. rspamd_message_dtor(struct rspamd_message *msg)
  904. {
  905. unsigned int i;
  906. struct rspamd_mime_part *p;
  907. struct rspamd_mime_text_part *tp;
  908. PTR_ARRAY_FOREACH(msg->parts, i, p)
  909. {
  910. if (p->raw_headers) {
  911. rspamd_message_headers_unref(p->raw_headers);
  912. }
  913. if (IS_PART_MULTIPART(p)) {
  914. if (p->specific.mp->children) {
  915. g_ptr_array_free(p->specific.mp->children, TRUE);
  916. }
  917. }
  918. if (p->part_type == RSPAMD_MIME_PART_CUSTOM_LUA &&
  919. p->specific.lua_specific.cbref != -1) {
  920. luaL_unref(msg->task->cfg->lua_state,
  921. LUA_REGISTRYINDEX,
  922. p->specific.lua_specific.cbref);
  923. }
  924. if (p->urls) {
  925. g_ptr_array_unref(p->urls);
  926. }
  927. }
  928. PTR_ARRAY_FOREACH(msg->text_parts, i, tp)
  929. {
  930. if (tp->utf_words) {
  931. g_array_free(tp->utf_words, TRUE);
  932. }
  933. if (tp->normalized_hashes) {
  934. g_array_free(tp->normalized_hashes, TRUE);
  935. }
  936. if (tp->languages) {
  937. g_ptr_array_unref(tp->languages);
  938. }
  939. }
  940. rspamd_message_headers_unref(msg->raw_headers);
  941. g_ptr_array_unref(msg->text_parts);
  942. g_ptr_array_unref(msg->parts);
  943. kh_destroy(rspamd_url_hash, msg->urls);
  944. }
  945. struct rspamd_message *
  946. rspamd_message_new(struct rspamd_task *task)
  947. {
  948. struct rspamd_message *msg;
  949. msg = rspamd_mempool_alloc0(task->task_pool, sizeof(*msg));
  950. msg->raw_headers = rspamd_message_headers_new();
  951. msg->urls = kh_init(rspamd_url_hash);
  952. msg->parts = g_ptr_array_sized_new(4);
  953. msg->text_parts = g_ptr_array_sized_new(2);
  954. msg->task = task;
  955. REF_INIT_RETAIN(msg, rspamd_message_dtor);
  956. return msg;
  957. }
  958. gboolean
  959. rspamd_message_parse(struct rspamd_task *task)
  960. {
  961. const char *p;
  962. gsize len;
  963. unsigned int i;
  964. GError *err = NULL;
  965. uint64_t n[2], seed;
  966. if (RSPAMD_TASK_IS_EMPTY(task)) {
  967. /* Don't do anything with empty task */
  968. task->flags |= RSPAMD_TASK_FLAG_SKIP_PROCESS;
  969. return TRUE;
  970. }
  971. p = task->msg.begin;
  972. len = task->msg.len;
  973. /* Skip any space characters to avoid some bad messages to be unparsed */
  974. while (len > 0 && g_ascii_isspace(*p)) {
  975. p++;
  976. len--;
  977. }
  978. /*
  979. * Exim somehow uses mailbox format for messages being scanned:
  980. * From xxx@xxx.com Fri May 13 19:08:48 2016
  981. *
  982. * So we check if a task has this line to avoid possible issues
  983. */
  984. if (len > sizeof("From ") - 1) {
  985. if (memcmp(p, "From ", sizeof("From ") - 1) == 0) {
  986. /* Skip to CRLF */
  987. msg_info_task("mailbox input detected, enable workaround");
  988. p += sizeof("From ") - 1;
  989. len -= sizeof("From ") - 1;
  990. while (len > 0 && *p != '\n') {
  991. p++;
  992. len--;
  993. }
  994. while (len > 0 && g_ascii_isspace(*p)) {
  995. p++;
  996. len--;
  997. }
  998. }
  999. }
  1000. task->msg.begin = p;
  1001. task->msg.len = len;
  1002. /* Cleanup old message */
  1003. if (task->message) {
  1004. rspamd_message_unref(task->message);
  1005. }
  1006. task->message = rspamd_message_new(task);
  1007. if (task->flags & RSPAMD_TASK_FLAG_MIME) {
  1008. enum rspamd_mime_parse_error ret;
  1009. debug_task("construct mime parser from string length %d",
  1010. (int) task->msg.len);
  1011. ret = rspamd_mime_parse_task(task, &err);
  1012. switch (ret) {
  1013. case RSPAMD_MIME_PARSE_FATAL:
  1014. msg_err_task("cannot construct mime from stream: %e", err);
  1015. if (task->cfg && (!task->cfg->allow_raw_input)) {
  1016. msg_err_task("cannot construct mime from stream");
  1017. if (err) {
  1018. task->err = err;
  1019. }
  1020. return FALSE;
  1021. }
  1022. else {
  1023. task->flags &= ~RSPAMD_TASK_FLAG_MIME;
  1024. rspamd_message_from_data(task, p, len);
  1025. }
  1026. break;
  1027. case RSPAMD_MIME_PARSE_NESTING:
  1028. msg_warn_task("cannot construct full mime from stream: %e", err);
  1029. task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS;
  1030. break;
  1031. case RSPAMD_MIME_PARSE_OK:
  1032. default:
  1033. break;
  1034. }
  1035. if (err) {
  1036. g_error_free(err);
  1037. }
  1038. }
  1039. else {
  1040. rspamd_message_from_data(task, p, len);
  1041. }
  1042. if (MESSAGE_FIELD(task, message_id) == NULL) {
  1043. MESSAGE_FIELD(task, message_id) = "undef";
  1044. }
  1045. debug_task("found %ud parts in message", MESSAGE_FIELD(task, parts)->len);
  1046. if (task->queue_id == NULL) {
  1047. task->queue_id = "undef";
  1048. }
  1049. rspamd_received_maybe_fix_task(task);
  1050. struct rspamd_mime_part *part;
  1051. /* Blake2b applied to string 'rspamd' */
  1052. static const unsigned char RSPAMD_ALIGNED(32) hash_key[] = {
  1053. 0xef,
  1054. 0x43,
  1055. 0xae,
  1056. 0x80,
  1057. 0xcc,
  1058. 0x8d,
  1059. 0xc3,
  1060. 0x4c,
  1061. 0x6f,
  1062. 0x1b,
  1063. 0xd6,
  1064. 0x18,
  1065. 0x1b,
  1066. 0xae,
  1067. 0x87,
  1068. 0x74,
  1069. 0x0c,
  1070. 0xca,
  1071. 0xf7,
  1072. 0x8e,
  1073. 0x5f,
  1074. 0x2e,
  1075. 0x54,
  1076. 0x32,
  1077. 0xf6,
  1078. 0x79,
  1079. 0xb9,
  1080. 0x27,
  1081. 0x26,
  1082. 0x96,
  1083. 0x20,
  1084. 0x92,
  1085. 0x70,
  1086. 0x07,
  1087. 0x85,
  1088. 0xeb,
  1089. 0x83,
  1090. 0xf7,
  1091. 0x89,
  1092. 0xe0,
  1093. 0xd7,
  1094. 0x32,
  1095. 0x2a,
  1096. 0xd2,
  1097. 0x1a,
  1098. 0x64,
  1099. 0x41,
  1100. 0xef,
  1101. 0x49,
  1102. 0xff,
  1103. 0xc3,
  1104. 0x8c,
  1105. 0x54,
  1106. 0xf9,
  1107. 0x67,
  1108. 0x74,
  1109. 0x30,
  1110. 0x1e,
  1111. 0x70,
  1112. 0x2e,
  1113. 0xb7,
  1114. 0x12,
  1115. 0x09,
  1116. 0xfe,
  1117. };
  1118. memcpy(&seed, hash_key, sizeof(seed));
  1119. PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part)
  1120. {
  1121. n[0] = t1ha2_atonce128(&n[1],
  1122. part->digest, sizeof(part->digest),
  1123. seed);
  1124. seed = n[0] ^ n[1];
  1125. }
  1126. memcpy(MESSAGE_FIELD(task, digest), n, sizeof(n));
  1127. if (MESSAGE_FIELD(task, subject)) {
  1128. p = MESSAGE_FIELD(task, subject);
  1129. len = strlen(p);
  1130. n[0] = t1ha2_atonce128(&n[1],
  1131. p, len,
  1132. seed);
  1133. memcpy(MESSAGE_FIELD(task, digest), n, sizeof(n));
  1134. }
  1135. if (task->queue_id) {
  1136. msg_info_task("loaded message; id: <%s>; queue-id: <%s>; size: %z; "
  1137. "checksum: <%*xs>",
  1138. MESSAGE_FIELD(task, message_id), task->queue_id, task->msg.len,
  1139. (int) sizeof(MESSAGE_FIELD(task, digest)), MESSAGE_FIELD(task, digest));
  1140. }
  1141. else {
  1142. msg_info_task("loaded message; id: <%s>; size: %z; "
  1143. "checksum: <%*xs>",
  1144. MESSAGE_FIELD(task, message_id), task->msg.len,
  1145. (int) sizeof(MESSAGE_FIELD(task, digest)), MESSAGE_FIELD(task, digest));
  1146. }
  1147. return TRUE;
  1148. }
  1149. /*
  1150. * A helper structure to store text parts positions, if it was C++, I could just use std::pair,
  1151. * but here I have to make it all manually, sigh...
  1152. */
  1153. struct rspamd_mime_part_text_position {
  1154. unsigned pos;
  1155. enum rspamd_message_part_is_text_result res;
  1156. };
  1157. /* Place html parts first during analysis */
  1158. static int
  1159. rspamd_mime_text_part_position_compare_func(const void *v1, const void *v2)
  1160. {
  1161. const struct rspamd_mime_part_text_position *p1 = (const struct rspamd_mime_part_text_position *) v1;
  1162. const struct rspamd_mime_part_text_position *p2 = (const struct rspamd_mime_part_text_position *) v2;
  1163. if (p1->res == p2->res) {
  1164. return (int) p2->pos - (int) p1->pos;
  1165. }
  1166. else {
  1167. if (p1->res == RSPAMD_MESSAGE_PART_IS_TEXT_HTML) {
  1168. return -1;
  1169. }
  1170. else {
  1171. return 1;
  1172. }
  1173. }
  1174. }
  1175. void rspamd_message_process(struct rspamd_task *task)
  1176. {
  1177. unsigned int i;
  1178. struct rspamd_mime_text_part *p1, *p2;
  1179. double diff, *pdiff;
  1180. unsigned int tw, *ptw, dw;
  1181. struct rspamd_mime_part *part;
  1182. lua_State *L = NULL;
  1183. int magic_func_pos = -1, content_func_pos = -1, old_top = -1, funcs_top = -1;
  1184. if (task->cfg) {
  1185. L = task->cfg->lua_state;
  1186. }
  1187. rspamd_archives_process(task);
  1188. if (L) {
  1189. old_top = lua_gettop(L);
  1190. }
  1191. if (L && rspamd_lua_require_function(L,
  1192. "lua_magic", "detect_mime_part")) {
  1193. magic_func_pos = lua_gettop(L);
  1194. }
  1195. else {
  1196. msg_err_task("cannot require lua_magic.detect_mime_part");
  1197. }
  1198. if (L && rspamd_lua_require_function(L,
  1199. "lua_content", "maybe_process_mime_part")) {
  1200. content_func_pos = lua_gettop(L);
  1201. }
  1202. else {
  1203. msg_err_task("cannot require lua_content.maybe_process_mime_part");
  1204. }
  1205. if (L) {
  1206. funcs_top = lua_gettop(L);
  1207. }
  1208. GArray *detected_text_parts = g_array_sized_new(FALSE, FALSE, sizeof(struct rspamd_mime_part_text_position), 2);
  1209. PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part)
  1210. {
  1211. if (magic_func_pos != -1 && part->parsed_data.len > 0) {
  1212. struct rspamd_mime_part **pmime;
  1213. struct rspamd_task **ptask;
  1214. lua_pushcfunction(L, &rspamd_lua_traceback);
  1215. int err_idx = lua_gettop(L);
  1216. lua_pushvalue(L, magic_func_pos);
  1217. pmime = lua_newuserdata(L, sizeof(struct rspamd_mime_part *));
  1218. rspamd_lua_setclass(L, rspamd_mimepart_classname, -1);
  1219. *pmime = part;
  1220. ptask = lua_newuserdata(L, sizeof(struct rspamd_task *));
  1221. rspamd_lua_setclass(L, rspamd_task_classname, -1);
  1222. *ptask = task;
  1223. if (lua_pcall(L, 2, 2, err_idx) != 0) {
  1224. msg_err_task("cannot detect type: %s", lua_tostring(L, -1));
  1225. }
  1226. else {
  1227. if (lua_istable(L, -1)) {
  1228. const char *mb;
  1229. /* First returned value */
  1230. part->detected_ext = rspamd_mempool_strdup(task->task_pool,
  1231. lua_tostring(L, -2));
  1232. lua_pushstring(L, "ct");
  1233. lua_gettable(L, -2);
  1234. if (lua_isstring(L, -1)) {
  1235. mb = lua_tostring(L, -1);
  1236. if (mb) {
  1237. rspamd_ftok_t srch;
  1238. srch.begin = mb;
  1239. srch.len = strlen(mb);
  1240. part->detected_ct = rspamd_content_type_parse(srch.begin,
  1241. srch.len,
  1242. task->task_pool);
  1243. }
  1244. }
  1245. lua_pop(L, 1);
  1246. lua_pushstring(L, "type");
  1247. lua_gettable(L, -2);
  1248. if (lua_isstring(L, -1)) {
  1249. part->detected_type = rspamd_mempool_strdup(task->task_pool,
  1250. lua_tostring(L, -1));
  1251. }
  1252. lua_pop(L, 1);
  1253. lua_pushstring(L, "no_text");
  1254. lua_gettable(L, -2);
  1255. if (lua_isboolean(L, -1)) {
  1256. if (!!lua_toboolean(L, -1)) {
  1257. part->flags |= RSPAMD_MIME_PART_NO_TEXT_EXTRACTION;
  1258. }
  1259. }
  1260. lua_pop(L, 1);
  1261. }
  1262. }
  1263. lua_settop(L, funcs_top);
  1264. }
  1265. /* Now detect content */
  1266. if (content_func_pos != -1 && part->parsed_data.len > 0 &&
  1267. part->part_type == RSPAMD_MIME_PART_UNDEFINED) {
  1268. struct rspamd_mime_part **pmime;
  1269. struct rspamd_task **ptask;
  1270. lua_pushcfunction(L, &rspamd_lua_traceback);
  1271. int err_idx = lua_gettop(L);
  1272. lua_pushvalue(L, content_func_pos);
  1273. pmime = lua_newuserdata(L, sizeof(struct rspamd_mime_part *));
  1274. rspamd_lua_setclass(L, rspamd_mimepart_classname, -1);
  1275. *pmime = part;
  1276. ptask = lua_newuserdata(L, sizeof(struct rspamd_task *));
  1277. rspamd_lua_setclass(L, rspamd_task_classname, -1);
  1278. *ptask = task;
  1279. if (lua_pcall(L, 2, 0, err_idx) != 0) {
  1280. msg_err_task("cannot detect content: %s", lua_tostring(L, -1));
  1281. }
  1282. lua_settop(L, funcs_top);
  1283. }
  1284. /* Try to detect image before checking for text */
  1285. rspamd_images_process_mime_part_maybe(task, part);
  1286. if (part->part_type == RSPAMD_MIME_PART_UNDEFINED &&
  1287. !(part->flags & RSPAMD_MIME_PART_NO_TEXT_EXTRACTION)) {
  1288. enum rspamd_message_part_is_text_result res = rspamd_message_part_can_be_parsed_as_text(task, part);
  1289. if (res != RSPAMD_MESSAGE_PART_IS_NOT_TEXT) {
  1290. struct rspamd_mime_part_text_position p = {
  1291. .pos = i,
  1292. .res = res};
  1293. g_array_append_val(detected_text_parts, p);
  1294. }
  1295. }
  1296. }
  1297. uint16_t cur_url_order = 0;
  1298. g_array_sort(detected_text_parts, rspamd_mime_text_part_position_compare_func);
  1299. /* One more iteration to process text parts in a more specific order */
  1300. for (i = 0; i < detected_text_parts->len; i++) {
  1301. part = g_ptr_array_index(MESSAGE_FIELD(task, parts),
  1302. g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).pos);
  1303. rspamd_message_process_text_part_maybe(task, part,
  1304. g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).res, &cur_url_order);
  1305. }
  1306. g_array_free(detected_text_parts, TRUE);
  1307. if (old_top != -1) {
  1308. lua_settop(L, old_top);
  1309. }
  1310. /* Parse urls inside Subject header */
  1311. if (MESSAGE_FIELD(task, subject)) {
  1312. rspamd_url_find_multiple(task->task_pool, MESSAGE_FIELD(task, subject),
  1313. strlen(MESSAGE_FIELD(task, subject)),
  1314. RSPAMD_URL_FIND_STRICT, NULL,
  1315. rspamd_url_task_subject_callback,
  1316. task);
  1317. }
  1318. /* Calculate average words length and number of short words */
  1319. struct rspamd_mime_text_part *text_part;
  1320. double *var;
  1321. unsigned int total_words = 0;
  1322. PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
  1323. {
  1324. if (!text_part->language) {
  1325. rspamd_mime_part_detect_language(task, text_part);
  1326. }
  1327. rspamd_mime_part_extract_words(task, text_part);
  1328. if (text_part->utf_words) {
  1329. total_words += text_part->nwords;
  1330. }
  1331. }
  1332. /* Calculate distance for 2-parts messages */
  1333. if (i == 2) {
  1334. p1 = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), 0);
  1335. p2 = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), 1);
  1336. /* First of all check parent object */
  1337. if (p1->mime_part->parent_part) {
  1338. rspamd_ftok_t srch;
  1339. srch.begin = "alternative";
  1340. srch.len = 11;
  1341. if (rspamd_ftok_cmp(&p1->mime_part->parent_part->ct->subtype, &srch) == 0) {
  1342. if (!IS_TEXT_PART_EMPTY(p1) && !IS_TEXT_PART_EMPTY(p2) &&
  1343. p1->normalized_hashes && p2->normalized_hashes) {
  1344. /*
  1345. * We also detect language on one part and propagate it to
  1346. * another one
  1347. */
  1348. struct rspamd_mime_text_part *sel;
  1349. /* Prefer HTML as text part is not displayed normally */
  1350. if (IS_TEXT_PART_HTML(p1)) {
  1351. sel = p1;
  1352. }
  1353. else if (IS_TEXT_PART_HTML(p2)) {
  1354. sel = p2;
  1355. }
  1356. else {
  1357. if (p1->utf_content.len > p2->utf_content.len) {
  1358. sel = p1;
  1359. }
  1360. else {
  1361. sel = p2;
  1362. }
  1363. }
  1364. if (sel->language && sel->language[0]) {
  1365. /* Propagate language */
  1366. if (sel == p1) {
  1367. if (p2->languages) {
  1368. g_ptr_array_unref(p2->languages);
  1369. }
  1370. p2->language = sel->language;
  1371. p2->languages = g_ptr_array_ref(sel->languages);
  1372. }
  1373. else {
  1374. if (p1->languages) {
  1375. g_ptr_array_unref(p1->languages);
  1376. }
  1377. p1->language = sel->language;
  1378. p1->languages = g_ptr_array_ref(sel->languages);
  1379. }
  1380. }
  1381. tw = p1->normalized_hashes->len + p2->normalized_hashes->len;
  1382. if (tw > 0) {
  1383. dw = rspamd_words_levenshtein_distance(task,
  1384. p1->normalized_hashes,
  1385. p2->normalized_hashes);
  1386. diff = dw / (double) tw;
  1387. msg_debug_task(
  1388. "different words: %d, total words: %d, "
  1389. "got diff between parts of %.2f",
  1390. dw, tw,
  1391. diff);
  1392. pdiff = rspamd_mempool_alloc(task->task_pool,
  1393. sizeof(double));
  1394. *pdiff = diff;
  1395. rspamd_mempool_set_variable(task->task_pool,
  1396. "parts_distance",
  1397. pdiff,
  1398. NULL);
  1399. ptw = rspamd_mempool_alloc(task->task_pool,
  1400. sizeof(int));
  1401. *ptw = tw;
  1402. rspamd_mempool_set_variable(task->task_pool,
  1403. "total_words",
  1404. ptw,
  1405. NULL);
  1406. }
  1407. }
  1408. }
  1409. }
  1410. else {
  1411. debug_task(
  1412. "message contains two parts but they are in different multi-parts");
  1413. }
  1414. }
  1415. if (total_words > 0) {
  1416. var = rspamd_mempool_get_variable(task->task_pool,
  1417. RSPAMD_MEMPOOL_AVG_WORDS_LEN);
  1418. if (var) {
  1419. *var /= (double) total_words;
  1420. }
  1421. var = rspamd_mempool_get_variable(task->task_pool,
  1422. RSPAMD_MEMPOOL_SHORT_WORDS_CNT);
  1423. if (var) {
  1424. *var /= (double) total_words;
  1425. }
  1426. }
  1427. rspamd_images_link(task);
  1428. rspamd_tokenize_meta_words(task);
  1429. }
  1430. struct rspamd_message *
  1431. rspamd_message_ref(struct rspamd_message *msg)
  1432. {
  1433. REF_RETAIN(msg);
  1434. return msg;
  1435. }
  1436. void rspamd_message_unref(struct rspamd_message *msg)
  1437. {
  1438. if (msg) {
  1439. REF_RELEASE(msg);
  1440. }
  1441. }
  1442. void rspamd_message_update_digest(struct rspamd_message *msg,
  1443. const void *input, gsize len)
  1444. {
  1445. uint64_t n[2];
  1446. /* Sanity */
  1447. G_STATIC_ASSERT(sizeof(n) == sizeof(msg->digest));
  1448. memcpy(n, msg->digest, sizeof(msg->digest));
  1449. n[0] = t1ha2_atonce128(&n[1], input, len, n[0]);
  1450. memcpy(msg->digest, n, sizeof(msg->digest));
  1451. }