You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

message.c 45KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840
  1. /*
  2. * Copyright (c) 2009-2012, Vsevolod Stakhov
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. * * Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * * Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in the
  11. * documentation and/or other materials provided with the distribution.
  12. *
  13. * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
  14. * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  15. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  16. * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
  17. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  18. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  19. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  20. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  21. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  22. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. */
  24. #include "config.h"
  25. #include "util.h"
  26. #include "main.h"
  27. #include "message.h"
  28. #include "cfg_file.h"
  29. #include "html.h"
  30. #include "images.h"
  31. #include "utlist.h"
  32. #include "tokenizers/tokenizers.h"
  33. #include "libstemmer.h"
  34. #include <iconv.h>
  35. #define RECURSION_LIMIT 5
  36. #define UTF8_CHARSET "UTF-8"
  37. #define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_PART_FLAG_UTF)
  38. #define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_PART_FLAG_UTF)
  39. static GQuark
  40. rspamd_message_quark (void)
  41. {
  42. return g_quark_from_static_string ("mime-error");
  43. }
  44. GByteArray *
  45. strip_html_tags (struct rspamd_task *task,
  46. rspamd_mempool_t * pool,
  47. struct mime_text_part *part,
  48. GByteArray * src,
  49. gint *stateptr)
  50. {
  51. uint8_t *p, *rp, *tbegin = NULL, *end, c, lc, *estart = NULL;
  52. gint br, i = 0, depth = 0, in_q = 0;
  53. gint state = 0;
  54. guint dlen;
  55. GByteArray *buf;
  56. GNode *level_ptr = NULL;
  57. gboolean erase = FALSE, html_decode = FALSE;
  58. if (stateptr)
  59. state = *stateptr;
  60. buf = g_byte_array_sized_new (src->len);
  61. g_byte_array_append (buf, src->data, src->len);
  62. c = *src->data;
  63. lc = '\0';
  64. p = src->data;
  65. rp = buf->data;
  66. end = src->data + src->len;
  67. br = 0;
  68. while (i < (gint)src->len) {
  69. switch (c) {
  70. case '\0':
  71. break;
  72. case '<':
  73. if (g_ascii_isspace (*(p + 1))) {
  74. goto reg_char;
  75. }
  76. if (state == 0) {
  77. lc = '<';
  78. tbegin = p + 1;
  79. state = 1;
  80. }
  81. else if (state == 1) {
  82. /* Opening bracket without closing one */
  83. p--;
  84. while (g_ascii_isspace (*p) && p > src->data) {
  85. p--;
  86. }
  87. p++;
  88. goto unbreak_tag;
  89. }
  90. break;
  91. case '(':
  92. if (state == 2) {
  93. if (lc != '"' && lc != '\'') {
  94. lc = '(';
  95. br++;
  96. }
  97. }
  98. else if (state == 0 && !erase) {
  99. *(rp++) = c;
  100. }
  101. break;
  102. case ')':
  103. if (state == 2) {
  104. if (lc != '"' && lc != '\'') {
  105. lc = ')';
  106. br--;
  107. }
  108. }
  109. else if (state == 0 && !erase) {
  110. *(rp++) = c;
  111. }
  112. break;
  113. case '>':
  114. if (depth) {
  115. depth--;
  116. break;
  117. }
  118. if (in_q) {
  119. break;
  120. }
  121. unbreak_tag:
  122. switch (state) {
  123. case 1: /* HTML/XML */
  124. lc = '>';
  125. in_q = state = 0;
  126. erase = !add_html_node (task,
  127. pool,
  128. part,
  129. tbegin,
  130. p - tbegin,
  131. end - tbegin,
  132. &level_ptr);
  133. break;
  134. case 2: /* PHP */
  135. if (!br && lc != '\"' && *(p - 1) == '?') {
  136. in_q = state = 0;
  137. }
  138. break;
  139. case 3:
  140. in_q = state = 0;
  141. break;
  142. case 4: /* JavaScript/CSS/etc... */
  143. if (p >= src->data + 2 && *(p - 1) == '-' && *(p - 2) == '-') {
  144. in_q = state = 0;
  145. }
  146. break;
  147. default:
  148. if (!erase) {
  149. *(rp++) = c;
  150. }
  151. break;
  152. }
  153. break;
  154. case '"':
  155. case '\'':
  156. if (state == 2 && *(p - 1) != '\\') {
  157. if (lc == c) {
  158. lc = '\0';
  159. }
  160. else if (lc != '\\') {
  161. lc = c;
  162. }
  163. }
  164. else if (state == 0 && !erase) {
  165. *(rp++) = c;
  166. }
  167. if (state && p != src->data && *(p - 1) != '\\' &&
  168. (!in_q || *p == in_q)) {
  169. if (in_q) {
  170. in_q = 0;
  171. }
  172. else {
  173. in_q = *p;
  174. }
  175. }
  176. break;
  177. case '!':
  178. /* JavaScript & Other HTML scripting languages */
  179. if (state == 1 && *(p - 1) == '<') {
  180. state = 3;
  181. lc = c;
  182. }
  183. else {
  184. if (state == 0 && !erase) {
  185. *(rp++) = c;
  186. }
  187. }
  188. break;
  189. case '-':
  190. if (state == 3 && p >= src->data + 2 && *(p - 1) == '-' &&
  191. *(p - 2) == '!') {
  192. state = 4;
  193. }
  194. else {
  195. goto reg_char;
  196. }
  197. break;
  198. case '&':
  199. /* Decode entitle */
  200. html_decode = TRUE;
  201. estart = rp;
  202. goto reg_char;
  203. break;
  204. case ';':
  205. if (html_decode) {
  206. html_decode = FALSE;
  207. *rp = ';';
  208. if (rp - estart > 0) {
  209. dlen = rp - estart + 1;
  210. decode_entitles (estart, &dlen);
  211. rp = estart + dlen;
  212. }
  213. }
  214. break;
  215. case '?':
  216. if (state == 1 && *(p - 1) == '<') {
  217. br = 0;
  218. state = 2;
  219. break;
  220. }
  221. case 'E':
  222. case 'e':
  223. /* !DOCTYPE exception */
  224. if (state == 3 && p > src->data + 6
  225. && g_ascii_tolower (*(p - 1)) == 'p'
  226. && g_ascii_tolower (*(p - 2)) == 'y'
  227. && g_ascii_tolower (*(p - 3)) == 't' &&
  228. g_ascii_tolower (*(p - 4)) == 'c' &&
  229. g_ascii_tolower (*(p - 5)) == 'o' &&
  230. g_ascii_tolower (*(p - 6)) == 'd') {
  231. state = 1;
  232. break;
  233. }
  234. /* fall-through */
  235. case 'l':
  236. /* swm: If we encounter '<?xml' then we shouldn't be in
  237. * state == 2 (PHP). Switch back to HTML.
  238. */
  239. if (state == 2 && p > src->data + 2 && *(p - 1) == 'm' &&
  240. *(p - 2) == 'x') {
  241. state = 1;
  242. break;
  243. }
  244. /* fall-through */
  245. default:
  246. reg_char:
  247. if (state == 0 && !erase) {
  248. *(rp++) = c;
  249. }
  250. break;
  251. }
  252. i++;
  253. if (i < (gint)src->len) {
  254. c = *(++p);
  255. }
  256. }
  257. if (rp < buf->data + src->len) {
  258. *rp = '\0';
  259. g_byte_array_set_size (buf, rp - buf->data);
  260. }
  261. /* Check tag balancing */
  262. if (level_ptr && level_ptr->data != NULL) {
  263. part->flags &= ~RSPAMD_MIME_PART_FLAG_BALANCED;
  264. }
  265. else {
  266. part->flags |= RSPAMD_MIME_PART_FLAG_BALANCED;
  267. }
  268. if (stateptr) {
  269. *stateptr = state;
  270. }
  271. return buf;
  272. }
  273. static void
  274. parse_qmail_recv (rspamd_mempool_t * pool,
  275. gchar *line,
  276. struct received_header *r)
  277. {
  278. gchar *s, *p, t;
  279. /* We are interested only with received from network headers */
  280. if ((p = strstr (line, "from network")) == NULL) {
  281. r->is_error = 2;
  282. return;
  283. }
  284. p += sizeof ("from network") - 1;
  285. while (g_ascii_isspace (*p) || *p == '[') {
  286. p++;
  287. }
  288. /* format is ip/host */
  289. s = p;
  290. if (*p) {
  291. while (g_ascii_isdigit (*++p) || *p == '.') ;
  292. if (*p != '/') {
  293. r->is_error = 1;
  294. return;
  295. }
  296. else {
  297. *p = '\0';
  298. r->real_ip = rspamd_mempool_strdup (pool, s);
  299. *p = '/';
  300. /* Now try to parse hostname */
  301. s = ++p;
  302. while (g_ascii_isalnum (*p) || *p == '.' || *p == '-' || *p ==
  303. '_') {
  304. p++;
  305. }
  306. t = *p;
  307. *p = '\0';
  308. r->real_hostname = rspamd_mempool_strdup (pool, s);
  309. *p = t;
  310. }
  311. }
  312. }
  313. static void
  314. parse_recv_header (rspamd_mempool_t * pool,
  315. struct raw_header *rh,
  316. struct received_header *r)
  317. {
  318. gchar *p, *s, t, **res = NULL;
  319. gchar *line;
  320. enum {
  321. RSPAMD_RECV_STATE_INIT = 0,
  322. RSPAMD_RECV_STATE_FROM,
  323. RSPAMD_RECV_STATE_IP_BLOCK,
  324. RSPAMD_RECV_STATE_BRACES_BLOCK,
  325. RSPAMD_RECV_STATE_BY_BLOCK,
  326. RSPAMD_RECV_STATE_PARSE_IP,
  327. RSPAMD_RECV_STATE_SKIP_SPACES,
  328. RSPAMD_RECV_STATE_ERROR
  329. } state = RSPAMD_RECV_STATE_INIT,
  330. next_state = RSPAMD_RECV_STATE_INIT;
  331. gboolean is_exim = FALSE;
  332. line = rh->decoded;
  333. if (line == NULL) {
  334. return;
  335. }
  336. g_strstrip (line);
  337. p = line;
  338. s = line;
  339. while (*p) {
  340. switch (state) {
  341. /* Initial state, search for from */
  342. case RSPAMD_RECV_STATE_INIT:
  343. if (*p == 'f' || *p == 'F') {
  344. if (g_ascii_tolower (*++p) == 'r' && g_ascii_tolower (*++p) ==
  345. 'o' && g_ascii_tolower (*++p) == 'm') {
  346. p++;
  347. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  348. next_state = RSPAMD_RECV_STATE_FROM;
  349. }
  350. }
  351. else if (g_ascii_tolower (*p) == 'b' &&
  352. g_ascii_tolower (*(p + 1)) == 'y') {
  353. state = RSPAMD_RECV_STATE_IP_BLOCK;
  354. }
  355. else {
  356. /* This can be qmail header, parse it separately */
  357. parse_qmail_recv (pool, line, r);
  358. return;
  359. }
  360. break;
  361. /* Read hostname */
  362. case RSPAMD_RECV_STATE_FROM:
  363. if (*p == '[') {
  364. /* This should be IP address */
  365. res = &r->from_ip;
  366. state = RSPAMD_RECV_STATE_PARSE_IP;
  367. next_state = RSPAMD_RECV_STATE_IP_BLOCK;
  368. s = ++p;
  369. }
  370. else if (g_ascii_isalnum (*p) || *p == '.' || *p == '-' || *p ==
  371. '_') {
  372. p++;
  373. }
  374. else {
  375. t = *p;
  376. *p = '\0';
  377. r->from_hostname = rspamd_mempool_strdup (pool, s);
  378. *p = t;
  379. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  380. next_state = RSPAMD_RECV_STATE_IP_BLOCK;
  381. }
  382. break;
  383. /* Try to extract additional info */
  384. case RSPAMD_RECV_STATE_IP_BLOCK:
  385. /* Try to extract ip or () info or by */
  386. if (g_ascii_tolower (*p) == 'b' && g_ascii_tolower (*(p + 1)) ==
  387. 'y') {
  388. p += 2;
  389. /* Skip spaces after by */
  390. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  391. next_state = RSPAMD_RECV_STATE_BY_BLOCK;
  392. }
  393. else if (*p == '(') {
  394. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  395. next_state = RSPAMD_RECV_STATE_BRACES_BLOCK;
  396. p++;
  397. }
  398. else if (*p == '[') {
  399. /* Got ip before '(' so extract it */
  400. s = ++p;
  401. res = &r->from_ip;
  402. state = RSPAMD_RECV_STATE_PARSE_IP;
  403. next_state = RSPAMD_RECV_STATE_IP_BLOCK;
  404. }
  405. else {
  406. p++;
  407. }
  408. break;
  409. /* We are in () block. Here can be found real hostname and real ip, this is written by some MTA */
  410. case RSPAMD_RECV_STATE_BRACES_BLOCK:
  411. /* End of block */
  412. if (g_ascii_isalnum (*p) || *p == '.' || *p == '-' ||
  413. *p == '_' || *p == ':') {
  414. p++;
  415. }
  416. else if (*p == '[') {
  417. s = ++p;
  418. state = RSPAMD_RECV_STATE_PARSE_IP;
  419. res = &r->real_ip;
  420. next_state = RSPAMD_RECV_STATE_BRACES_BLOCK;
  421. }
  422. else {
  423. if (p > s) {
  424. /* Got some real hostname */
  425. /* check whether it is helo or p is not space symbol */
  426. if (!g_ascii_isspace (*p) || *(p + 1) != '[') {
  427. /* Exim style ([ip]:port helo=hostname) */
  428. if (*s == ':' && (g_ascii_isspace (*p) || *p == ')')) {
  429. /* Ip ending */
  430. is_exim = TRUE;
  431. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  432. next_state = RSPAMD_RECV_STATE_BRACES_BLOCK;
  433. }
  434. else if (p - s == 4 && memcmp (s, "helo=", 5) == 0) {
  435. p++;
  436. is_exim = TRUE;
  437. if (r->real_hostname == NULL && r->from_hostname !=
  438. NULL) {
  439. r->real_hostname = r->from_hostname;
  440. }
  441. s = p;
  442. while (*p != ')' && !g_ascii_isspace (*p) && *p !=
  443. '\0') {
  444. p++;
  445. }
  446. if (p > s) {
  447. r->from_hostname = rspamd_mempool_alloc (pool,
  448. p - s + 1);
  449. rspamd_strlcpy (r->from_hostname, s, p - s + 1);
  450. }
  451. }
  452. else if (p - s == 4 && memcmp (s, "port=", 5) == 0) {
  453. p++;
  454. is_exim = TRUE;
  455. while (g_ascii_isdigit (*p)) {
  456. p++;
  457. }
  458. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  459. next_state = RSPAMD_RECV_STATE_BRACES_BLOCK;
  460. }
  461. else if (*p == '=' && is_exim) {
  462. /* Just skip unknown pairs */
  463. p++;
  464. while (!g_ascii_isspace (*p) && *p != ')' && *p !=
  465. '\0') {
  466. p++;
  467. }
  468. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  469. next_state = RSPAMD_RECV_STATE_BRACES_BLOCK;
  470. }
  471. else {
  472. /* skip all */
  473. while (*p++ != ')' && *p != '\0') ;
  474. state = RSPAMD_RECV_STATE_IP_BLOCK;
  475. }
  476. }
  477. else {
  478. /* Postfix style (hostname [ip]) */
  479. t = *p;
  480. *p = '\0';
  481. r->real_hostname = rspamd_mempool_strdup (pool, s);
  482. *p = t;
  483. /* Now parse ip */
  484. p += 2;
  485. s = p;
  486. res = &r->real_ip;
  487. state = RSPAMD_RECV_STATE_PARSE_IP;
  488. next_state = RSPAMD_RECV_STATE_BRACES_BLOCK;
  489. continue;
  490. }
  491. if (*p == ')') {
  492. p++;
  493. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  494. next_state = RSPAMD_RECV_STATE_IP_BLOCK;
  495. }
  496. }
  497. else if (*p == ')') {
  498. p++;
  499. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  500. next_state = RSPAMD_RECV_STATE_IP_BLOCK;
  501. }
  502. else {
  503. r->is_error = 1;
  504. return;
  505. }
  506. }
  507. break;
  508. /* Got by word */
  509. case RSPAMD_RECV_STATE_BY_BLOCK:
  510. /* Here can be only hostname */
  511. if ((g_ascii_isalnum (*p) || *p == '.' || *p == '-'
  512. || *p == '_') && p[1] != '\0') {
  513. p++;
  514. }
  515. else {
  516. /* We got something like hostname */
  517. if (p[1] != '\0') {
  518. t = *p;
  519. *p = '\0';
  520. r->by_hostname = rspamd_mempool_strdup (pool, s);
  521. *p = t;
  522. }
  523. else {
  524. r->by_hostname = rspamd_mempool_strdup (pool, s);
  525. }
  526. /* Now end of parsing */
  527. if (is_exim) {
  528. /* Adjust for exim received */
  529. if (r->real_ip == NULL && r->from_ip != NULL) {
  530. r->real_ip = r->from_ip;
  531. }
  532. else if (r->from_ip == NULL && r->real_ip != NULL) {
  533. r->from_ip = r->real_ip;
  534. if (r->real_hostname == NULL && r->from_hostname !=
  535. NULL) {
  536. r->real_hostname = r->from_hostname;
  537. }
  538. }
  539. }
  540. return;
  541. }
  542. break;
  543. /* Extract ip */
  544. case RSPAMD_RECV_STATE_PARSE_IP:
  545. while (g_ascii_isxdigit (*p) || *p == '.' || *p == ':') {
  546. p++;
  547. }
  548. if (*p != ']') {
  549. /* Not an ip in fact */
  550. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  551. p++;
  552. }
  553. else {
  554. *p = '\0';
  555. *res = rspamd_mempool_strdup (pool, s);
  556. *p = ']';
  557. p++;
  558. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  559. }
  560. break;
  561. /* Skip spaces */
  562. case RSPAMD_RECV_STATE_SKIP_SPACES:
  563. if (!g_ascii_isspace (*p)) {
  564. state = next_state;
  565. s = p;
  566. }
  567. else {
  568. p++;
  569. }
  570. break;
  571. default:
  572. r->is_error = 1;
  573. return;
  574. break;
  575. }
  576. }
  577. r->is_error = 1;
  578. return;
  579. }
  580. static void
  581. append_raw_header (GHashTable *target, struct raw_header *rh)
  582. {
  583. struct raw_header *lp;
  584. rh->next = NULL;
  585. rh->prev = rh;
  586. if ((lp =
  587. g_hash_table_lookup (target, rh->name)) != NULL) {
  588. DL_APPEND (lp, rh);
  589. }
  590. else {
  591. g_hash_table_insert (target, rh->name, rh);
  592. }
  593. debug_task ("add raw header %s: %s", rh->name, rh->value);
  594. }
  595. /* Convert raw headers to a list of struct raw_header * */
  596. static void
  597. process_raw_headers (struct rspamd_task *task, GHashTable *target,
  598. const gchar *in, gsize len)
  599. {
  600. struct raw_header *new = NULL;
  601. const gchar *p, *c, *end;
  602. gchar *tmp, *tp;
  603. gint state = 0, l, next_state = 100, err_state = 100, t_state;
  604. gboolean valid_folding = FALSE;
  605. p = in;
  606. end = p + len;
  607. c = p;
  608. while (p < end) {
  609. /* FSM for processing headers */
  610. switch (state) {
  611. case 0:
  612. /* Begin processing headers */
  613. if (!g_ascii_isalpha (*p)) {
  614. /* We have some garbage at the beginning of headers, skip this line */
  615. state = 100;
  616. next_state = 0;
  617. }
  618. else {
  619. state = 1;
  620. c = p;
  621. }
  622. break;
  623. case 1:
  624. /* We got something like header's name */
  625. if (*p == ':') {
  626. new =
  627. rspamd_mempool_alloc0 (task->task_pool,
  628. sizeof (struct raw_header));
  629. new->prev = new;
  630. l = p - c;
  631. tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
  632. rspamd_strlcpy (tmp, c, l + 1);
  633. new->name = tmp;
  634. new->empty_separator = TRUE;
  635. p++;
  636. state = 2;
  637. c = p;
  638. }
  639. else if (g_ascii_isspace (*p)) {
  640. /* Not header but some garbage */
  641. state = 100;
  642. next_state = 0;
  643. }
  644. else {
  645. p++;
  646. }
  647. break;
  648. case 2:
  649. /* We got header's name, so skip any \t or spaces */
  650. if (*p == '\t') {
  651. new->tab_separated = TRUE;
  652. new->empty_separator = FALSE;
  653. p++;
  654. }
  655. else if (*p == ' ') {
  656. new->empty_separator = FALSE;
  657. p++;
  658. }
  659. else if (*p == '\n' || *p == '\r') {
  660. /* Process folding */
  661. state = 99;
  662. l = p - c;
  663. if (l > 0) {
  664. tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
  665. rspamd_strlcpy (tmp, c, l + 1);
  666. new->separator = tmp;
  667. }
  668. next_state = 3;
  669. err_state = 5;
  670. c = p;
  671. }
  672. else {
  673. /* Process value */
  674. l = p - c;
  675. if (l >= 0) {
  676. tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
  677. rspamd_strlcpy (tmp, c, l + 1);
  678. new->separator = tmp;
  679. }
  680. c = p;
  681. state = 3;
  682. }
  683. break;
  684. case 3:
  685. if (*p == '\r' || *p == '\n') {
  686. /* Hold folding */
  687. state = 99;
  688. next_state = 3;
  689. err_state = 4;
  690. }
  691. else if (*(p + 1) == '\0') {
  692. state = 4;
  693. }
  694. else {
  695. p++;
  696. }
  697. break;
  698. case 4:
  699. /* Copy header's value */
  700. l = p - c;
  701. tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
  702. tp = tmp;
  703. t_state = 0;
  704. while (l--) {
  705. if (t_state == 0) {
  706. /* Before folding */
  707. if (*c == '\n' || *c == '\r') {
  708. t_state = 1;
  709. c++;
  710. *tp++ = ' ';
  711. }
  712. else {
  713. *tp++ = *c++;
  714. }
  715. }
  716. else if (t_state == 1) {
  717. /* Inside folding */
  718. if (g_ascii_isspace (*c)) {
  719. c++;
  720. }
  721. else {
  722. t_state = 0;
  723. *tp++ = *c++;
  724. }
  725. }
  726. }
  727. /* Strip last space that can be added by \r\n parsing */
  728. if (*(tp - 1) == ' ') {
  729. tp--;
  730. }
  731. *tp = '\0';
  732. new->value = tmp;
  733. new->decoded = g_mime_utils_header_decode_text (new->value);
  734. rspamd_mempool_add_destructor (task->task_pool,
  735. (rspamd_mempool_destruct_t)g_free, new->decoded);
  736. append_raw_header (target, new);
  737. state = 0;
  738. break;
  739. case 5:
  740. /* Header has only name, no value */
  741. new->value = "";
  742. new->decoded = NULL;
  743. append_raw_header (target, new);
  744. state = 0;
  745. break;
  746. case 99:
  747. /* Folding state */
  748. if (*(p + 1) == '\0') {
  749. state = err_state;
  750. }
  751. else {
  752. if (*p == '\r' || *p == '\n') {
  753. p++;
  754. valid_folding = FALSE;
  755. }
  756. else if (*p == '\t' || *p == ' ') {
  757. /* Valid folding */
  758. p++;
  759. valid_folding = TRUE;
  760. }
  761. else {
  762. if (valid_folding) {
  763. debug_task ("go to state: %d->%d", state, next_state);
  764. state = next_state;
  765. }
  766. else {
  767. /* Fall back */
  768. debug_task ("go to state: %d->%d", state, err_state);
  769. state = err_state;
  770. }
  771. }
  772. }
  773. break;
  774. case 100:
  775. /* Fail state, skip line */
  776. if (*p == '\r') {
  777. if (*(p + 1) == '\n') {
  778. p++;
  779. }
  780. p++;
  781. state = next_state;
  782. }
  783. else if (*p == '\n') {
  784. if (*(p + 1) == '\r') {
  785. p++;
  786. }
  787. p++;
  788. state = next_state;
  789. }
  790. else if (*(p + 1) == '\0') {
  791. state = next_state;
  792. p++;
  793. }
  794. else {
  795. p++;
  796. }
  797. break;
  798. }
  799. }
  800. }
  801. static void
  802. free_byte_array_callback (void *pointer)
  803. {
  804. GByteArray *arr = (GByteArray *) pointer;
  805. g_byte_array_free (arr, TRUE);
  806. }
  807. static gboolean
  808. charset_validate (rspamd_mempool_t *pool, const gchar *in, gchar **out)
  809. {
  810. /*
  811. * This is a simple routine to validate input charset
  812. * we just check that charset starts with alphanumeric and ends
  813. * with alphanumeric
  814. */
  815. const gchar *begin, *end;
  816. gboolean changed = FALSE, to_uppercase = FALSE;
  817. begin = in;
  818. while (!g_ascii_isalnum (*begin)) {
  819. begin ++;
  820. changed = TRUE;
  821. }
  822. if (!g_ascii_islower(*begin)) {
  823. changed = TRUE;
  824. to_uppercase = TRUE;
  825. }
  826. end = begin + strlen (begin) - 1;
  827. while (!g_ascii_isalnum (*end)) {
  828. end --;
  829. changed = TRUE;
  830. }
  831. if (!changed) {
  832. *out = (gchar *)in;
  833. }
  834. else {
  835. *out = rspamd_mempool_alloc (pool, end - begin + 2);
  836. if (to_uppercase) {
  837. gchar *o = *out;
  838. while (begin != end + 1) {
  839. if (g_ascii_islower (*begin)) {
  840. *o++ = g_ascii_toupper (*begin ++);
  841. }
  842. else {
  843. *o++ = *begin++;
  844. }
  845. }
  846. *o = '\0';
  847. }
  848. else {
  849. rspamd_strlcpy (*out, begin, end - begin + 2);
  850. }
  851. }
  852. return TRUE;
  853. }
  854. static GQuark
  855. converter_error_quark (void)
  856. {
  857. return g_quark_from_static_string ("conversion error");
  858. }
  859. static gchar *
  860. rspamd_text_to_utf8 (struct rspamd_task *task,
  861. gchar *input, gsize len, const gchar *in_enc,
  862. gsize *olen, GError **err)
  863. {
  864. gchar *res, *s, *d;
  865. gsize outlen;
  866. iconv_t ic;
  867. gsize processed, ret;
  868. ic = iconv_open (UTF8_CHARSET, in_enc);
  869. if (ic == (iconv_t)-1) {
  870. g_set_error (err, converter_error_quark(), EINVAL,
  871. "cannot open iconv for: %s", in_enc);
  872. return NULL;
  873. }
  874. /* For the most of charsets utf8 notation is larger than native one */
  875. outlen = len * 2 + 1;
  876. res = rspamd_mempool_alloc (task->task_pool, outlen);
  877. s = input;
  878. d = res;
  879. processed = outlen - 1;
  880. while (len > 0 && processed > 0) {
  881. ret = iconv (ic, &s, &len, &d, &processed);
  882. if (ret == (gsize)-1) {
  883. switch (errno) {
  884. case E2BIG:
  885. g_set_error (err, converter_error_quark(), EINVAL,
  886. "output of size %zd is not enough to handle "
  887. "converison of %zd bytes", outlen, len);
  888. iconv_close (ic);
  889. return NULL;
  890. case EILSEQ:
  891. case EINVAL:
  892. /* Ignore bad characters */
  893. if (processed > 0 && len > 0) {
  894. *d++ = '?';
  895. s++;
  896. len --;
  897. processed --;
  898. }
  899. break;
  900. }
  901. }
  902. else if (ret == 0) {
  903. break;
  904. }
  905. }
  906. *d = '\0';
  907. *olen = d - res;
  908. iconv_close (ic);
  909. return res;
  910. }
  911. static GByteArray *
  912. convert_text_to_utf (struct rspamd_task *task,
  913. GByteArray * part_content,
  914. GMimeContentType * type,
  915. struct mime_text_part *text_part)
  916. {
  917. GError *err = NULL;
  918. gsize write_bytes;
  919. const gchar *charset;
  920. gchar *res_str, *ocharset;
  921. GByteArray *result_array;
  922. if (task->cfg->raw_mode) {
  923. SET_PART_RAW (text_part);
  924. return part_content;
  925. }
  926. if ((charset =
  927. g_mime_content_type_get_parameter (type, "charset")) == NULL) {
  928. SET_PART_RAW (text_part);
  929. return part_content;
  930. }
  931. if (!charset_validate (task->task_pool, charset, &ocharset)) {
  932. msg_info (
  933. "<%s>: has invalid charset",
  934. task->message_id);
  935. SET_PART_RAW (text_part);
  936. return part_content;
  937. }
  938. if (g_ascii_strcasecmp (ocharset,
  939. "utf-8") == 0 || g_ascii_strcasecmp (ocharset, "utf8") == 0) {
  940. if (g_utf8_validate (part_content->data, part_content->len, NULL)) {
  941. SET_PART_UTF (text_part);
  942. return part_content;
  943. }
  944. else {
  945. msg_info (
  946. "<%s>: contains invalid utf8 characters, assume it as raw",
  947. task->message_id);
  948. SET_PART_RAW (text_part);
  949. return part_content;
  950. }
  951. }
  952. else {
  953. res_str = rspamd_text_to_utf8 (task, part_content->data,
  954. part_content->len,
  955. ocharset,
  956. &write_bytes,
  957. &err);
  958. if (res_str == NULL) {
  959. msg_warn ("<%s>: cannot convert from %s to utf8: %s",
  960. task->message_id,
  961. ocharset,
  962. err ? err->message : "unknown problem");
  963. SET_PART_RAW (text_part);
  964. g_error_free (err);
  965. return part_content;
  966. }
  967. }
  968. result_array = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray));
  969. result_array->data = res_str;
  970. result_array->len = write_bytes;
  971. SET_PART_UTF (text_part);
  972. return result_array;
  973. }
  974. struct language_match {
  975. const char *code;
  976. const char *name;
  977. GUnicodeScript script;
  978. };
  979. static int
  980. language_elts_cmp (const void *a, const void *b)
  981. {
  982. GUnicodeScript sc = *(const GUnicodeScript *)a;
  983. const struct language_match *bb = (const struct language_match *)b;
  984. return (sc - bb->script);
  985. }
  986. static void
  987. detect_text_language (struct mime_text_part *part)
  988. {
  989. /* Keep sorted */
  990. static const struct language_match language_codes[] = {
  991. { "", "english", G_UNICODE_SCRIPT_COMMON },
  992. { "", "", G_UNICODE_SCRIPT_INHERITED },
  993. { "ar", "arabic", G_UNICODE_SCRIPT_ARABIC },
  994. { "hy", "armenian", G_UNICODE_SCRIPT_ARMENIAN },
  995. { "bn", "chineese", G_UNICODE_SCRIPT_BENGALI },
  996. { "", "", G_UNICODE_SCRIPT_BOPOMOFO },
  997. { "chr", "", G_UNICODE_SCRIPT_CHEROKEE },
  998. { "cop", "", G_UNICODE_SCRIPT_COPTIC },
  999. { "ru", "russian", G_UNICODE_SCRIPT_CYRILLIC },
  1000. /* Deseret was used to write English */
  1001. { "", "", G_UNICODE_SCRIPT_DESERET },
  1002. { "hi", "", G_UNICODE_SCRIPT_DEVANAGARI },
  1003. { "am", "", G_UNICODE_SCRIPT_ETHIOPIC },
  1004. { "ka", "", G_UNICODE_SCRIPT_GEORGIAN },
  1005. { "", "", G_UNICODE_SCRIPT_GOTHIC },
  1006. { "el", "greek", G_UNICODE_SCRIPT_GREEK },
  1007. { "gu", "", G_UNICODE_SCRIPT_GUJARATI },
  1008. { "pa", "", G_UNICODE_SCRIPT_GURMUKHI },
  1009. { "han", "chineese", G_UNICODE_SCRIPT_HAN },
  1010. { "ko", "", G_UNICODE_SCRIPT_HANGUL },
  1011. { "he", "hebrew", G_UNICODE_SCRIPT_HEBREW },
  1012. { "ja", "", G_UNICODE_SCRIPT_HIRAGANA },
  1013. { "kn", "", G_UNICODE_SCRIPT_KANNADA },
  1014. { "ja", "", G_UNICODE_SCRIPT_KATAKANA },
  1015. { "km", "", G_UNICODE_SCRIPT_KHMER },
  1016. { "lo", "", G_UNICODE_SCRIPT_LAO },
  1017. { "en", "english", G_UNICODE_SCRIPT_LATIN },
  1018. { "ml", "", G_UNICODE_SCRIPT_MALAYALAM },
  1019. { "mn", "", G_UNICODE_SCRIPT_MONGOLIAN },
  1020. { "my", "", G_UNICODE_SCRIPT_MYANMAR },
  1021. /* Ogham was used to write old Irish */
  1022. { "", "", G_UNICODE_SCRIPT_OGHAM },
  1023. { "", "", G_UNICODE_SCRIPT_OLD_ITALIC },
  1024. { "or", "", G_UNICODE_SCRIPT_ORIYA },
  1025. { "", "", G_UNICODE_SCRIPT_RUNIC },
  1026. { "si", "", G_UNICODE_SCRIPT_SINHALA },
  1027. { "syr", "", G_UNICODE_SCRIPT_SYRIAC },
  1028. { "ta", "", G_UNICODE_SCRIPT_TAMIL },
  1029. { "te", "", G_UNICODE_SCRIPT_TELUGU },
  1030. { "dv", "", G_UNICODE_SCRIPT_THAANA },
  1031. { "th", "", G_UNICODE_SCRIPT_THAI },
  1032. { "bo", "", G_UNICODE_SCRIPT_TIBETAN },
  1033. { "iu", "", G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL },
  1034. { "", "", G_UNICODE_SCRIPT_YI },
  1035. { "tl", "", G_UNICODE_SCRIPT_TAGALOG },
  1036. /* Phillipino languages/scripts */
  1037. { "hnn", "", G_UNICODE_SCRIPT_HANUNOO },
  1038. { "bku", "", G_UNICODE_SCRIPT_BUHID },
  1039. { "tbw", "", G_UNICODE_SCRIPT_TAGBANWA },
  1040. { "", "", G_UNICODE_SCRIPT_BRAILLE },
  1041. { "", "", G_UNICODE_SCRIPT_CYPRIOT },
  1042. { "", "", G_UNICODE_SCRIPT_LIMBU },
  1043. /* Used for Somali (so) in the past */
  1044. { "", "", G_UNICODE_SCRIPT_OSMANYA },
  1045. /* The Shavian alphabet was designed for English */
  1046. { "", "", G_UNICODE_SCRIPT_SHAVIAN },
  1047. { "", "", G_UNICODE_SCRIPT_LINEAR_B },
  1048. { "", "", G_UNICODE_SCRIPT_TAI_LE },
  1049. { "uga", "", G_UNICODE_SCRIPT_UGARITIC },
  1050. { "", "", G_UNICODE_SCRIPT_NEW_TAI_LUE },
  1051. { "bug", "", G_UNICODE_SCRIPT_BUGINESE },
  1052. { "", "", G_UNICODE_SCRIPT_GLAGOLITIC },
  1053. /* Used for for Berber (ber), but Arabic script is more common */
  1054. { "", "", G_UNICODE_SCRIPT_TIFINAGH },
  1055. { "syl", "", G_UNICODE_SCRIPT_SYLOTI_NAGRI },
  1056. { "peo", "", G_UNICODE_SCRIPT_OLD_PERSIAN },
  1057. { "", "", G_UNICODE_SCRIPT_KHAROSHTHI },
  1058. { "", "", G_UNICODE_SCRIPT_UNKNOWN },
  1059. { "", "", G_UNICODE_SCRIPT_BALINESE },
  1060. { "", "", G_UNICODE_SCRIPT_CUNEIFORM },
  1061. { "", "", G_UNICODE_SCRIPT_PHOENICIAN },
  1062. { "", "", G_UNICODE_SCRIPT_PHAGS_PA },
  1063. { "nqo", "", G_UNICODE_SCRIPT_NKO }
  1064. };
  1065. const struct language_match *lm;
  1066. const int max_chars = 32;
  1067. if (part != NULL) {
  1068. if (IS_PART_UTF (part)) {
  1069. /* Try to detect encoding by several symbols */
  1070. const gchar *p, *pp;
  1071. gunichar c;
  1072. gint32 remain = part->content->len, max = 0, processed = 0;
  1073. gint32 scripts[G_N_ELEMENTS (language_codes)];
  1074. GUnicodeScript scc, sel = G_UNICODE_SCRIPT_COMMON;
  1075. p = part->content->data;
  1076. memset (scripts, 0, sizeof (scripts));
  1077. while (remain > 0 && processed < max_chars) {
  1078. c = g_utf8_get_char_validated (p, remain);
  1079. if (c == (gunichar) -2 || c == (gunichar) -1) {
  1080. break;
  1081. }
  1082. if (g_unichar_isalpha (c)) {
  1083. scc = g_unichar_get_script (c);
  1084. if (scc < (gint)G_N_ELEMENTS (scripts)) {
  1085. scripts[scc]++;
  1086. }
  1087. processed ++;
  1088. }
  1089. pp = g_utf8_next_char (p);
  1090. remain -= pp - p;
  1091. p = pp;
  1092. }
  1093. for (remain = 0; remain < (gint)G_N_ELEMENTS (scripts); remain++) {
  1094. if (scripts[remain] > max) {
  1095. max = scripts[remain];
  1096. sel = remain;
  1097. }
  1098. }
  1099. part->script = sel;
  1100. lm = bsearch (&sel, language_codes, G_N_ELEMENTS (language_codes),
  1101. sizeof (language_codes[0]), &language_elts_cmp);
  1102. if (lm != NULL) {
  1103. part->lang_code = lm->code;
  1104. part->language = lm->name;
  1105. }
  1106. }
  1107. }
  1108. }
  1109. static void
  1110. rspamd_normalize_text_part (struct rspamd_task *task,
  1111. struct mime_text_part *part)
  1112. {
  1113. struct sb_stemmer *stem = NULL;
  1114. rspamd_fstring_t *w;
  1115. const guchar *r;
  1116. gchar *temp_word;
  1117. guint i, nlen;
  1118. GArray *tmp;
  1119. if (part->language && part->language[0] != '\0' && IS_PART_UTF (part)) {
  1120. stem = sb_stemmer_new (part->language, "UTF_8");
  1121. if (stem == NULL) {
  1122. msg_info ("<%s> cannot create lemmatizer for %s language",
  1123. task->message_id, part->language);
  1124. }
  1125. }
  1126. /* Ugly workaround */
  1127. tmp = rspamd_tokenize_text (part->content->data,
  1128. part->content->len, IS_PART_UTF (part), task->cfg->min_word_len,
  1129. part->urls_offset, FALSE);
  1130. if (tmp) {
  1131. for (i = 0; i < tmp->len; i ++) {
  1132. w = &g_array_index (tmp, rspamd_fstring_t, i);
  1133. if (stem) {
  1134. r = sb_stemmer_stem (stem, w->begin, w->len);
  1135. }
  1136. if (w->len > 0 && !(w->len == 6 && memcmp (w->begin, "!!EX!!", 6) == 0)) {
  1137. if (stem != NULL && r != NULL) {
  1138. nlen = strlen (r);
  1139. nlen = MIN (nlen, w->len);
  1140. w->begin = rspamd_mempool_alloc (task->task_pool, nlen);
  1141. memcpy (w->begin, r, nlen);
  1142. w->len = nlen;
  1143. }
  1144. else {
  1145. temp_word = w->begin;
  1146. w->begin = rspamd_mempool_alloc (task->task_pool, w->len);
  1147. memcpy (w->begin, temp_word, w->len);
  1148. if (IS_PART_UTF (part)) {
  1149. rspamd_str_lc_utf8 (w->begin, w->len);
  1150. }
  1151. else {
  1152. rspamd_str_lc (w->begin, w->len);
  1153. }
  1154. }
  1155. }
  1156. }
  1157. part->normalized_words = tmp;
  1158. }
  1159. if (stem != NULL) {
  1160. sb_stemmer_delete (stem);
  1161. }
  1162. }
  1163. static void
  1164. process_text_part (struct rspamd_task *task,
  1165. GByteArray *part_content,
  1166. GMimeContentType *type,
  1167. struct mime_part *mime_part,
  1168. GMimeObject *parent,
  1169. gboolean is_empty)
  1170. {
  1171. struct mime_text_part *text_part;
  1172. const gchar *cd, *p, *c;
  1173. guint remain;
  1174. /* Skip attachements */
  1175. #ifndef GMIME24
  1176. cd = g_mime_part_get_content_disposition (GMIME_PART (mime_part->mime));
  1177. if (cd &&
  1178. g_ascii_strcasecmp (cd,
  1179. "attachment") == 0 && !task->cfg->check_text_attachements) {
  1180. debug_task ("skip attachments for checking as text parts");
  1181. return;
  1182. }
  1183. #else
  1184. cd = g_mime_object_get_disposition (GMIME_OBJECT (mime_part->mime));
  1185. if (cd &&
  1186. g_ascii_strcasecmp (cd,
  1187. GMIME_DISPOSITION_ATTACHMENT) == 0 &&
  1188. !task->cfg->check_text_attachements) {
  1189. debug_task ("skip attachments for checking as text parts");
  1190. return;
  1191. }
  1192. #endif
  1193. if (g_mime_content_type_is_type (type, "text",
  1194. "html") || g_mime_content_type_is_type (type, "text", "xhtml")) {
  1195. text_part =
  1196. rspamd_mempool_alloc0 (task->task_pool,
  1197. sizeof (struct mime_text_part));
  1198. text_part->flags |= RSPAMD_MIME_PART_FLAG_HTML;
  1199. if (is_empty) {
  1200. text_part->flags |= RSPAMD_MIME_PART_FLAG_EMPTY;
  1201. text_part->orig = NULL;
  1202. text_part->content = NULL;
  1203. task->text_parts = g_list_prepend (task->text_parts, text_part);
  1204. return;
  1205. }
  1206. text_part->orig = part_content;
  1207. part_content = convert_text_to_utf (task,
  1208. text_part->orig,
  1209. type,
  1210. text_part);
  1211. text_part->html_nodes = NULL;
  1212. text_part->parent = parent;
  1213. text_part->mime_part = mime_part;
  1214. text_part->flags |= RSPAMD_MIME_PART_FLAG_BALANCED;
  1215. text_part->content = strip_html_tags (task,
  1216. task->task_pool,
  1217. text_part,
  1218. part_content,
  1219. NULL);
  1220. if (text_part->html_nodes != NULL) {
  1221. decode_entitles (text_part->content->data,
  1222. &text_part->content->len);
  1223. }
  1224. rspamd_url_text_extract (task->task_pool, task, text_part, TRUE);
  1225. rspamd_fuzzy_from_text_part (text_part, task->task_pool, task->cfg->max_diff);
  1226. rspamd_mempool_add_destructor (task->task_pool,
  1227. (rspamd_mempool_destruct_t) free_byte_array_callback,
  1228. text_part->content);
  1229. task->text_parts = g_list_prepend (task->text_parts, text_part);
  1230. }
  1231. else if (g_mime_content_type_is_type (type, "text", "*")) {
  1232. text_part =
  1233. rspamd_mempool_alloc0 (task->task_pool,
  1234. sizeof (struct mime_text_part));
  1235. text_part->parent = parent;
  1236. text_part->mime_part = mime_part;
  1237. if (is_empty) {
  1238. text_part->flags |= RSPAMD_MIME_PART_FLAG_EMPTY;
  1239. text_part->orig = NULL;
  1240. text_part->content = NULL;
  1241. task->text_parts = g_list_prepend (task->text_parts, text_part);
  1242. return;
  1243. }
  1244. text_part->content = convert_text_to_utf (task,
  1245. part_content,
  1246. type,
  1247. text_part);
  1248. text_part->orig = part_content;
  1249. rspamd_url_text_extract (task->task_pool, task, text_part, FALSE);
  1250. rspamd_fuzzy_from_text_part (text_part, task->task_pool, task->cfg->max_diff);
  1251. task->text_parts = g_list_prepend (task->text_parts, text_part);
  1252. }
  1253. else {
  1254. return;
  1255. }
  1256. /* Post process part */
  1257. detect_text_language (text_part);
  1258. text_part->words = rspamd_tokenize_text (text_part->content->data,
  1259. text_part->content->len, IS_PART_UTF (text_part), task->cfg->min_word_len,
  1260. text_part->urls_offset, FALSE);
  1261. rspamd_normalize_text_part (task, text_part);
  1262. /* Calculate number of lines */
  1263. p = text_part->content->data;
  1264. remain = text_part->content->len;
  1265. c = p;
  1266. while (p != NULL && remain > 0) {
  1267. p = memchr (c, '\n', remain);
  1268. if (p != NULL) {
  1269. text_part->nlines ++;
  1270. remain -= p - c + 1;
  1271. c = p + 1;
  1272. }
  1273. }
  1274. }
  1275. #ifdef GMIME24
  1276. static void
  1277. mime_foreach_callback (GMimeObject * parent,
  1278. GMimeObject * part,
  1279. gpointer user_data)
  1280. #else
  1281. static void
  1282. mime_foreach_callback (GMimeObject * part, gpointer user_data)
  1283. #endif
  1284. {
  1285. struct rspamd_task *task = (struct rspamd_task *)user_data;
  1286. struct mime_part *mime_part;
  1287. GMimeContentType *type;
  1288. GMimeDataWrapper *wrapper;
  1289. GMimeStream *part_stream;
  1290. GByteArray *part_content;
  1291. task->parts_count++;
  1292. /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */
  1293. /* find out what class 'part' is... */
  1294. if (GMIME_IS_MESSAGE_PART (part)) {
  1295. /* message/rfc822 or message/news */
  1296. GMimeMessage *message;
  1297. /* g_mime_message_foreach_part() won't descend into
  1298. child message parts, so if we want to count any
  1299. subparts of this child message, we'll have to call
  1300. g_mime_message_foreach_part() again here. */
  1301. message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
  1302. if (task->scan_milliseconds++ < RECURSION_LIMIT) {
  1303. #ifdef GMIME24
  1304. g_mime_message_foreach (message, mime_foreach_callback, task);
  1305. #else
  1306. g_mime_message_foreach_part (message, mime_foreach_callback, task);
  1307. #endif
  1308. }
  1309. else {
  1310. msg_err ("too deep mime recursion detected: %d", task->scan_milliseconds);
  1311. return;
  1312. }
  1313. #ifndef GMIME24
  1314. g_object_unref (message);
  1315. #endif
  1316. }
  1317. else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
  1318. /* message/partial */
  1319. /* this is an incomplete message part, probably a
  1320. large message that the sender has broken into
  1321. smaller parts and is sending us bit by bit. we
  1322. could save some info about it so that we could
  1323. piece this back together again once we get all the
  1324. parts? */
  1325. }
  1326. else if (GMIME_IS_MULTIPART (part)) {
  1327. /* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */
  1328. task->parser_parent_part = part;
  1329. #ifndef GMIME24
  1330. debug_task ("detected multipart part");
  1331. /* we'll get to finding out if this is a signed/encrypted multipart later... */
  1332. if (task->parser_recursion++ < RECURSION_LIMIT) {
  1333. g_mime_multipart_foreach ((GMimeMultipart *) part,
  1334. mime_foreach_callback,
  1335. task);
  1336. }
  1337. else {
  1338. msg_err ("endless recursion detected: %d", task->parser_recursion);
  1339. return;
  1340. }
  1341. #endif
  1342. }
  1343. else if (GMIME_IS_PART (part)) {
  1344. /* a normal leaf part, could be text/plain or image/jpeg etc */
  1345. #ifdef GMIME24
  1346. type = (GMimeContentType *) g_mime_object_get_content_type (GMIME_OBJECT (
  1347. part));
  1348. #else
  1349. type =
  1350. (GMimeContentType *) g_mime_part_get_content_type (GMIME_PART (part));
  1351. #endif
  1352. if (type == NULL) {
  1353. msg_warn ("type of part is unknown, assume text/plain");
  1354. type = g_mime_content_type_new ("text", "plain");
  1355. #ifdef GMIME24
  1356. rspamd_mempool_add_destructor (task->task_pool,
  1357. (rspamd_mempool_destruct_t) g_object_unref, type);
  1358. #else
  1359. rspamd_mempool_add_destructor (task->task_pool,
  1360. (rspamd_mempool_destruct_t) g_mime_content_type_destroy, type);
  1361. #endif
  1362. }
  1363. wrapper = g_mime_part_get_content_object (GMIME_PART (part));
  1364. #ifdef GMIME24
  1365. if (wrapper != NULL && GMIME_IS_DATA_WRAPPER (wrapper)) {
  1366. #else
  1367. if (wrapper != NULL) {
  1368. #endif
  1369. part_stream = g_mime_stream_mem_new ();
  1370. if (g_mime_data_wrapper_write_to_stream (wrapper,
  1371. part_stream) != -1) {
  1372. gchar *hdrs;
  1373. g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (
  1374. part_stream), FALSE);
  1375. part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (
  1376. part_stream));
  1377. g_object_unref (part_stream);
  1378. mime_part =
  1379. rspamd_mempool_alloc (task->task_pool,
  1380. sizeof (struct mime_part));
  1381. hdrs = g_mime_object_get_headers (GMIME_OBJECT (part));
  1382. mime_part->raw_headers = g_hash_table_new (rspamd_strcase_hash,
  1383. rspamd_strcase_equal);
  1384. rspamd_mempool_add_destructor (task->task_pool,
  1385. (rspamd_mempool_destruct_t) g_hash_table_destroy,
  1386. mime_part->raw_headers);
  1387. if (hdrs != NULL) {
  1388. process_raw_headers (task, mime_part->raw_headers,
  1389. hdrs, strlen (hdrs));
  1390. g_free (hdrs);
  1391. }
  1392. mime_part->type = type;
  1393. mime_part->content = part_content;
  1394. mime_part->parent = task->parser_parent_part;
  1395. mime_part->filename = g_mime_part_get_filename (GMIME_PART (
  1396. part));
  1397. mime_part->mime = part;
  1398. debug_task ("found part with content-type: %s/%s",
  1399. type->type,
  1400. type->subtype);
  1401. task->parts = g_list_prepend (task->parts, mime_part);
  1402. /* Skip empty parts */
  1403. process_text_part (task,
  1404. part_content,
  1405. type,
  1406. mime_part,
  1407. task->parser_parent_part,
  1408. (part_content->len <= 0));
  1409. }
  1410. else {
  1411. msg_warn ("write to stream failed: %d, %s", errno,
  1412. strerror (errno));
  1413. }
  1414. #ifndef GMIME24
  1415. g_object_unref (wrapper);
  1416. #endif
  1417. }
  1418. else {
  1419. msg_warn ("cannot get wrapper for mime part, type of part: %s/%s",
  1420. type->type,
  1421. type->subtype);
  1422. }
  1423. }
  1424. else {
  1425. g_assert_not_reached ();
  1426. }
  1427. }
  1428. static void
  1429. destroy_message (void *pointer)
  1430. {
  1431. GMimeMessage *msg = pointer;
  1432. msg_debug ("freeing pointer %p", msg);
  1433. g_object_unref (msg);
  1434. }
  1435. gboolean
  1436. rspamd_message_parse (struct rspamd_task *task)
  1437. {
  1438. GMimeMessage *message;
  1439. GMimeParser *parser;
  1440. GMimeStream *stream;
  1441. GByteArray *tmp;
  1442. GList *first, *cur;
  1443. GMimePart *part;
  1444. GMimeDataWrapper *wrapper;
  1445. struct received_header *recv;
  1446. gchar *mid, *url_str;
  1447. const gchar *url_end, *p, *end;
  1448. struct rspamd_url *subject_url;
  1449. gsize len;
  1450. gint64 hdr_start, hdr_end;
  1451. gint rc, state = 0;
  1452. tmp = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray));
  1453. p = task->msg.start;
  1454. len = task->msg.len;
  1455. /* Skip any space characters to avoid some bad messages to be unparsed */
  1456. while (g_ascii_isspace (*p) && len > 0) {
  1457. p ++;
  1458. len --;
  1459. }
  1460. tmp->data = (guint8 *)p;
  1461. tmp->len = len;
  1462. stream = g_mime_stream_mem_new_with_byte_array (tmp);
  1463. /*
  1464. * This causes g_mime_stream not to free memory by itself as it is memory allocated by
  1465. * pool allocator
  1466. */
  1467. g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (stream), FALSE);
  1468. if (task->flags & RSPAMD_TASK_FLAG_MIME) {
  1469. debug_task ("construct mime parser from string length %d",
  1470. (gint)task->msg.len);
  1471. /* create a new parser object to parse the stream */
  1472. parser = g_mime_parser_new_with_stream (stream);
  1473. g_object_unref (stream);
  1474. /* parse the message from the stream */
  1475. message = g_mime_parser_construct_message (parser);
  1476. if (message == NULL) {
  1477. msg_warn ("cannot construct mime from stream");
  1478. g_set_error (&task->err, rspamd_message_quark(), RSPAMD_FILTER_ERROR,\
  1479. "cannot parse MIME in the message");
  1480. /* TODO: backport to 0.9 */
  1481. g_object_unref (parser);
  1482. return FALSE;
  1483. }
  1484. task->message = message;
  1485. rspamd_mempool_add_destructor (task->task_pool,
  1486. (rspamd_mempool_destruct_t) destroy_message, task->message);
  1487. /* Save message id for future use */
  1488. task->message_id = g_mime_message_get_message_id (task->message);
  1489. if (task->message_id == NULL) {
  1490. task->message_id = "undef";
  1491. }
  1492. /*
  1493. * XXX: we use this strange value to save bytes in the task for
  1494. * saving foreach recursion
  1495. */
  1496. task->scan_milliseconds = 0;
  1497. #ifdef GMIME24
  1498. g_mime_message_foreach (message, mime_foreach_callback, task);
  1499. #else
  1500. /*
  1501. * This is rather strange, but gmime 2.2 do NOT pass top-level part to foreach callback
  1502. * so we need to set up parent part by hands
  1503. */
  1504. task->parser_parent_part = g_mime_message_get_mime_part (message);
  1505. g_object_unref (task->parser_parent_part);
  1506. g_mime_message_foreach_part (message, mime_foreach_callback, task);
  1507. #endif
  1508. task->scan_milliseconds = 0;
  1509. debug_task ("found %d parts in message", task->parts_count);
  1510. if (task->queue_id == NULL) {
  1511. task->queue_id = "undef";
  1512. }
  1513. hdr_start = g_mime_parser_get_headers_begin (parser);
  1514. hdr_end = g_mime_parser_get_headers_end (parser);
  1515. if (hdr_start != -1 && hdr_end != -1) {
  1516. g_assert (hdr_start < hdr_end);
  1517. g_assert (hdr_end < (gint64)len);
  1518. task->raw_headers_content.begin = (gchar *)(p + hdr_start);
  1519. task->raw_headers_content.len = (guint64)(hdr_end - hdr_start);
  1520. process_raw_headers (task, task->raw_headers,
  1521. task->raw_headers_content.begin,
  1522. task->raw_headers_content.len);
  1523. }
  1524. process_images (task);
  1525. /* Parse received headers */
  1526. first =
  1527. rspamd_message_get_header (task, "Received", FALSE);
  1528. cur = first;
  1529. while (cur) {
  1530. recv =
  1531. rspamd_mempool_alloc0 (task->task_pool,
  1532. sizeof (struct received_header));
  1533. parse_recv_header (task->task_pool, cur->data, recv);
  1534. task->received = g_list_prepend (task->received, recv);
  1535. cur = g_list_next (cur);
  1536. }
  1537. /* Extract data from received header if we were not given IP */
  1538. if (task->received && (task->flags & RSPAMD_TASK_FLAG_NO_IP)) {
  1539. recv = task->received->data;
  1540. if (recv->real_ip) {
  1541. if (!rspamd_parse_inet_address (&task->from_addr, recv->real_ip)) {
  1542. msg_warn ("cannot get IP from received header: '%s'",
  1543. recv->real_ip);
  1544. task->from_addr = NULL;
  1545. }
  1546. }
  1547. if (recv->real_hostname) {
  1548. task->hostname = recv->real_hostname;
  1549. }
  1550. }
  1551. /* free the parser (and the stream) */
  1552. g_object_unref (parser);
  1553. }
  1554. else {
  1555. /* We got only message, no mime headers or anything like this */
  1556. /* Construct fake message for it */
  1557. message = g_mime_message_new (TRUE);
  1558. task->message = message;
  1559. if (task->from_envelope) {
  1560. g_mime_message_set_sender (task->message,
  1561. rspamd_task_get_sender (task));
  1562. }
  1563. /* Construct part for it */
  1564. part = g_mime_part_new_with_type ("text", "html");
  1565. #ifdef GMIME24
  1566. wrapper = g_mime_data_wrapper_new_with_stream (stream,
  1567. GMIME_CONTENT_ENCODING_8BIT);
  1568. #else
  1569. wrapper = g_mime_data_wrapper_new_with_stream (stream,
  1570. GMIME_PART_ENCODING_8BIT);
  1571. #endif
  1572. g_mime_part_set_content_object (part, wrapper);
  1573. g_mime_message_set_mime_part (task->message, GMIME_OBJECT (part));
  1574. /* Register destructors */
  1575. rspamd_mempool_add_destructor (task->task_pool,
  1576. (rspamd_mempool_destruct_t) g_object_unref, wrapper);
  1577. rspamd_mempool_add_destructor (task->task_pool,
  1578. (rspamd_mempool_destruct_t) g_object_unref, part);
  1579. rspamd_mempool_add_destructor (task->task_pool,
  1580. (rspamd_mempool_destruct_t) destroy_message, task->message);
  1581. /*
  1582. * XXX: we use this strange value to save bytes in the task for
  1583. * saving foreach recursion
  1584. */
  1585. task->scan_milliseconds = 0;
  1586. #ifdef GMIME24
  1587. g_mime_message_foreach (task->message, mime_foreach_callback, task);
  1588. #else
  1589. g_mime_message_foreach_part (task->message, mime_foreach_callback,
  1590. task);
  1591. #endif
  1592. task->scan_milliseconds = 0;
  1593. /* Generate message ID */
  1594. mid = g_mime_utils_generate_message_id ("localhost.localdomain");
  1595. rspamd_mempool_add_destructor (task->task_pool,
  1596. (rspamd_mempool_destruct_t) g_free, mid);
  1597. g_mime_message_set_message_id (task->message, mid);
  1598. task->message_id = mid;
  1599. task->queue_id = mid;
  1600. /* Set headers for message */
  1601. if (task->subject) {
  1602. g_mime_message_set_subject (task->message, task->subject);
  1603. }
  1604. }
  1605. /* Set mime recipients and sender for the task */
  1606. task->rcpt_mime = g_mime_message_get_all_recipients (message);
  1607. if (task->rcpt_mime) {
  1608. #ifdef GMIME24
  1609. rspamd_mempool_add_destructor (task->task_pool,
  1610. (rspamd_mempool_destruct_t) g_object_unref,
  1611. task->rcpt_mime);
  1612. #else
  1613. rspamd_mempool_add_destructor (task->task_pool,
  1614. (rspamd_mempool_destruct_t) internet_address_list_destroy,
  1615. task->rcpt_mime);
  1616. #endif
  1617. }
  1618. task->from_mime = internet_address_list_parse_string(
  1619. g_mime_message_get_sender (message));
  1620. if (task->from_mime) {
  1621. #ifdef GMIME24
  1622. rspamd_mempool_add_destructor (task->task_pool,
  1623. (rspamd_mempool_destruct_t) g_object_unref,
  1624. task->from_mime);
  1625. #else
  1626. rspamd_mempool_add_destructor (task->task_pool,
  1627. (rspamd_mempool_destruct_t) internet_address_list_destroy,
  1628. task->from_mime);
  1629. #endif
  1630. }
  1631. /* Parse urls inside Subject header */
  1632. cur = rspamd_message_get_header (task, "Subject", FALSE);
  1633. if (cur) {
  1634. p = cur->data;
  1635. len = strlen (p);
  1636. end = p + len;
  1637. while (p < end) {
  1638. /* Search to the end of url */
  1639. if (rspamd_url_find (task->task_pool, p, end - p, NULL, &url_end,
  1640. &url_str, FALSE, &state)) {
  1641. if (url_str != NULL) {
  1642. subject_url = rspamd_mempool_alloc0 (task->task_pool,
  1643. sizeof (struct rspamd_url));
  1644. rc = rspamd_url_parse (subject_url, url_str,
  1645. strlen (url_str), task->task_pool);
  1646. if ((rc == URI_ERRNO_OK) && subject_url->hostlen > 0) {
  1647. if (subject_url->protocol != PROTOCOL_MAILTO) {
  1648. if (!g_hash_table_lookup (task->urls, subject_url)) {
  1649. g_hash_table_insert (task->urls,
  1650. subject_url,
  1651. subject_url);
  1652. }
  1653. }
  1654. }
  1655. else if (rc != URI_ERRNO_OK) {
  1656. msg_info ("extract of url '%s' failed: %s",
  1657. url_str,
  1658. rspamd_url_strerror (rc));
  1659. }
  1660. }
  1661. }
  1662. else {
  1663. break;
  1664. }
  1665. p = url_end + 1;
  1666. }
  1667. }
  1668. return TRUE;
  1669. }
  1670. GList *
  1671. rspamd_message_get_header (struct rspamd_task *task,
  1672. const gchar *field,
  1673. gboolean strong)
  1674. {
  1675. GList *gret = NULL;
  1676. struct raw_header *rh;
  1677. rh = g_hash_table_lookup (task->raw_headers, field);
  1678. if (rh == NULL) {
  1679. return NULL;
  1680. }
  1681. while (rh) {
  1682. if (strong) {
  1683. if (strcmp (rh->name, field) == 0) {
  1684. gret = g_list_prepend (gret, rh);
  1685. }
  1686. }
  1687. else {
  1688. gret = g_list_prepend (gret, rh);
  1689. }
  1690. rh = rh->next;
  1691. }
  1692. if (gret != NULL) {
  1693. rspamd_mempool_add_destructor (task->task_pool,
  1694. (rspamd_mempool_destruct_t)g_list_free, gret);
  1695. }
  1696. return gret;
  1697. }