You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

message.c 41KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708
  1. /*
  2. * Copyright (c) 2009-2012, Vsevolod Stakhov
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. * * Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * * Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in the
  11. * documentation and/or other materials provided with the distribution.
  12. *
  13. * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
  14. * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  15. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  16. * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
  17. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  18. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  19. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  20. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  21. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  22. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. */
  24. #include "config.h"
  25. #include "util.h"
  26. #include "main.h"
  27. #include "message.h"
  28. #include "cfg_file.h"
  29. #include "html.h"
  30. #include "images.h"
  31. #include "utlist.h"
  32. #include "tokenizers.h"
  33. #include <iconv.h>
  34. #define RECURSION_LIMIT 30
  35. #define UTF8_CHARSET "UTF-8"
  36. GByteArray *
  37. strip_html_tags (struct rspamd_task *task,
  38. rspamd_mempool_t * pool,
  39. struct mime_text_part *part,
  40. GByteArray * src,
  41. gint *stateptr)
  42. {
  43. uint8_t *p, *rp, *tbegin = NULL, *end, c, lc, *estart;
  44. gint br, i = 0, depth = 0, in_q = 0;
  45. gint state = 0;
  46. guint dlen;
  47. GByteArray *buf;
  48. GNode *level_ptr = NULL;
  49. gboolean erase = FALSE, html_decode = FALSE;
  50. if (stateptr)
  51. state = *stateptr;
  52. buf = g_byte_array_sized_new (src->len);
  53. g_byte_array_append (buf, src->data, src->len);
  54. c = *src->data;
  55. lc = '\0';
  56. p = src->data;
  57. rp = buf->data;
  58. end = src->data + src->len;
  59. br = 0;
  60. while (i < (gint)src->len) {
  61. switch (c) {
  62. case '\0':
  63. break;
  64. case '<':
  65. if (g_ascii_isspace (*(p + 1))) {
  66. goto reg_char;
  67. }
  68. if (state == 0) {
  69. lc = '<';
  70. tbegin = p + 1;
  71. state = 1;
  72. }
  73. else if (state == 1) {
  74. /* Opening bracket without closing one */
  75. p--;
  76. while (g_ascii_isspace (*p) && p > src->data) {
  77. p--;
  78. }
  79. p++;
  80. goto unbreak_tag;
  81. }
  82. break;
  83. case '(':
  84. if (state == 2) {
  85. if (lc != '"' && lc != '\'') {
  86. lc = '(';
  87. br++;
  88. }
  89. }
  90. else if (state == 0 && !erase) {
  91. *(rp++) = c;
  92. }
  93. break;
  94. case ')':
  95. if (state == 2) {
  96. if (lc != '"' && lc != '\'') {
  97. lc = ')';
  98. br--;
  99. }
  100. }
  101. else if (state == 0 && !erase) {
  102. *(rp++) = c;
  103. }
  104. break;
  105. case '>':
  106. if (depth) {
  107. depth--;
  108. break;
  109. }
  110. if (in_q) {
  111. break;
  112. }
  113. unbreak_tag:
  114. switch (state) {
  115. case 1: /* HTML/XML */
  116. lc = '>';
  117. in_q = state = 0;
  118. erase = !add_html_node (task,
  119. pool,
  120. part,
  121. tbegin,
  122. p - tbegin,
  123. end - tbegin,
  124. &level_ptr);
  125. break;
  126. case 2: /* PHP */
  127. if (!br && lc != '\"' && *(p - 1) == '?') {
  128. in_q = state = 0;
  129. }
  130. break;
  131. case 3:
  132. in_q = state = 0;
  133. break;
  134. case 4: /* JavaScript/CSS/etc... */
  135. if (p >= src->data + 2 && *(p - 1) == '-' && *(p - 2) == '-') {
  136. in_q = state = 0;
  137. }
  138. break;
  139. default:
  140. if (!erase) {
  141. *(rp++) = c;
  142. }
  143. break;
  144. }
  145. break;
  146. case '"':
  147. case '\'':
  148. if (state == 2 && *(p - 1) != '\\') {
  149. if (lc == c) {
  150. lc = '\0';
  151. }
  152. else if (lc != '\\') {
  153. lc = c;
  154. }
  155. }
  156. else if (state == 0 && !erase) {
  157. *(rp++) = c;
  158. }
  159. if (state && p != src->data && *(p - 1) != '\\' &&
  160. (!in_q || *p == in_q)) {
  161. if (in_q) {
  162. in_q = 0;
  163. }
  164. else {
  165. in_q = *p;
  166. }
  167. }
  168. break;
  169. case '!':
  170. /* JavaScript & Other HTML scripting languages */
  171. if (state == 1 && *(p - 1) == '<') {
  172. state = 3;
  173. lc = c;
  174. }
  175. else {
  176. if (state == 0 && !erase) {
  177. *(rp++) = c;
  178. }
  179. }
  180. break;
  181. case '-':
  182. if (state == 3 && p >= src->data + 2 && *(p - 1) == '-' &&
  183. *(p - 2) == '!') {
  184. state = 4;
  185. }
  186. else {
  187. goto reg_char;
  188. }
  189. break;
  190. case '&':
  191. /* Decode entitle */
  192. html_decode = TRUE;
  193. estart = rp;
  194. goto reg_char;
  195. break;
  196. case ';':
  197. if (html_decode) {
  198. html_decode = FALSE;
  199. *rp = ';';
  200. if (rp - estart > 0) {
  201. dlen = rp - estart + 1;
  202. decode_entitles (estart, &dlen);
  203. rp = estart + dlen;
  204. }
  205. }
  206. break;
  207. case '?':
  208. if (state == 1 && *(p - 1) == '<') {
  209. br = 0;
  210. state = 2;
  211. break;
  212. }
  213. case 'E':
  214. case 'e':
  215. /* !DOCTYPE exception */
  216. if (state == 3 && p > src->data + 6
  217. && g_ascii_tolower (*(p - 1)) == 'p'
  218. && g_ascii_tolower (*(p - 2)) == 'y'
  219. && g_ascii_tolower (*(p - 3)) == 't' &&
  220. g_ascii_tolower (*(p - 4)) == 'c' &&
  221. g_ascii_tolower (*(p - 5)) == 'o' &&
  222. g_ascii_tolower (*(p - 6)) == 'd') {
  223. state = 1;
  224. break;
  225. }
  226. /* fall-through */
  227. case 'l':
  228. /* swm: If we encounter '<?xml' then we shouldn't be in
  229. * state == 2 (PHP). Switch back to HTML.
  230. */
  231. if (state == 2 && p > src->data + 2 && *(p - 1) == 'm' &&
  232. *(p - 2) == 'x') {
  233. state = 1;
  234. break;
  235. }
  236. /* fall-through */
  237. default:
  238. reg_char:
  239. if (state == 0 && !erase) {
  240. *(rp++) = c;
  241. }
  242. break;
  243. }
  244. i++;
  245. if (i < (gint)src->len) {
  246. c = *(++p);
  247. }
  248. }
  249. if (rp < buf->data + src->len) {
  250. *rp = '\0';
  251. g_byte_array_set_size (buf, rp - buf->data);
  252. }
  253. /* Check tag balancing */
  254. if (level_ptr && level_ptr->data != NULL) {
  255. part->is_balanced = FALSE;
  256. }
  257. if (stateptr) {
  258. *stateptr = state;
  259. }
  260. return buf;
  261. }
  262. static void
  263. parse_qmail_recv (rspamd_mempool_t * pool,
  264. gchar *line,
  265. struct received_header *r)
  266. {
  267. gchar *s, *p, t;
  268. /* We are interested only with received from network headers */
  269. if ((p = strstr (line, "from network")) == NULL) {
  270. r->is_error = 2;
  271. return;
  272. }
  273. p += sizeof ("from network") - 1;
  274. while (g_ascii_isspace (*p) || *p == '[') {
  275. p++;
  276. }
  277. /* format is ip/host */
  278. s = p;
  279. if (*p) {
  280. while (g_ascii_isdigit (*++p) || *p == '.') ;
  281. if (*p != '/') {
  282. r->is_error = 1;
  283. return;
  284. }
  285. else {
  286. *p = '\0';
  287. r->real_ip = rspamd_mempool_strdup (pool, s);
  288. *p = '/';
  289. /* Now try to parse hostname */
  290. s = ++p;
  291. while (g_ascii_isalnum (*p) || *p == '.' || *p == '-' || *p ==
  292. '_') {
  293. p++;
  294. }
  295. t = *p;
  296. *p = '\0';
  297. r->real_hostname = rspamd_mempool_strdup (pool, s);
  298. *p = t;
  299. }
  300. }
  301. }
  302. static void
  303. parse_recv_header (rspamd_mempool_t * pool,
  304. struct raw_header *rh,
  305. struct received_header *r)
  306. {
  307. gchar *p, *s, t, **res = NULL;
  308. gchar *line;
  309. enum {
  310. RSPAMD_RECV_STATE_INIT = 0,
  311. RSPAMD_RECV_STATE_FROM,
  312. RSPAMD_RECV_STATE_IP_BLOCK,
  313. RSPAMD_RECV_STATE_BRACES_BLOCK,
  314. RSPAMD_RECV_STATE_BY_BLOCK,
  315. RSPAMD_RECV_STATE_PARSE_IP,
  316. RSPAMD_RECV_STATE_SKIP_SPACES,
  317. RSPAMD_RECV_STATE_ERROR
  318. } state = RSPAMD_RECV_STATE_INIT,
  319. next_state = RSPAMD_RECV_STATE_INIT;
  320. gboolean is_exim = FALSE;
  321. line = rh->decoded;
  322. if (line == NULL) {
  323. return;
  324. }
  325. g_strstrip (line);
  326. p = line;
  327. s = line;
  328. while (*p) {
  329. switch (state) {
  330. /* Initial state, search for from */
  331. case RSPAMD_RECV_STATE_INIT:
  332. if (*p == 'f' || *p == 'F') {
  333. if (g_ascii_tolower (*++p) == 'r' && g_ascii_tolower (*++p) ==
  334. 'o' && g_ascii_tolower (*++p) == 'm') {
  335. p++;
  336. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  337. next_state = RSPAMD_RECV_STATE_FROM;
  338. }
  339. }
  340. else if (g_ascii_tolower (*p) == 'b' &&
  341. g_ascii_tolower (*(p + 1)) == 'y') {
  342. state = RSPAMD_RECV_STATE_IP_BLOCK;
  343. }
  344. else {
  345. /* This can be qmail header, parse it separately */
  346. parse_qmail_recv (pool, line, r);
  347. return;
  348. }
  349. break;
  350. /* Read hostname */
  351. case RSPAMD_RECV_STATE_FROM:
  352. if (*p == '[') {
  353. /* This should be IP address */
  354. res = &r->from_ip;
  355. state = RSPAMD_RECV_STATE_PARSE_IP;
  356. next_state = RSPAMD_RECV_STATE_IP_BLOCK;
  357. s = ++p;
  358. }
  359. else if (g_ascii_isalnum (*p) || *p == '.' || *p == '-' || *p ==
  360. '_') {
  361. p++;
  362. }
  363. else {
  364. t = *p;
  365. *p = '\0';
  366. r->from_hostname = rspamd_mempool_strdup (pool, s);
  367. *p = t;
  368. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  369. next_state = RSPAMD_RECV_STATE_IP_BLOCK;
  370. }
  371. break;
  372. /* Try to extract additional info */
  373. case RSPAMD_RECV_STATE_IP_BLOCK:
  374. /* Try to extract ip or () info or by */
  375. if (g_ascii_tolower (*p) == 'b' && g_ascii_tolower (*(p + 1)) ==
  376. 'y') {
  377. p += 2;
  378. /* Skip spaces after by */
  379. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  380. next_state = RSPAMD_RECV_STATE_BY_BLOCK;
  381. }
  382. else if (*p == '(') {
  383. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  384. next_state = RSPAMD_RECV_STATE_BRACES_BLOCK;
  385. p++;
  386. }
  387. else if (*p == '[') {
  388. /* Got ip before '(' so extract it */
  389. s = ++p;
  390. res = &r->from_ip;
  391. state = RSPAMD_RECV_STATE_PARSE_IP;
  392. next_state = RSPAMD_RECV_STATE_IP_BLOCK;
  393. }
  394. else {
  395. p++;
  396. }
  397. break;
  398. /* We are in () block. Here can be found real hostname and real ip, this is written by some MTA */
  399. case RSPAMD_RECV_STATE_BRACES_BLOCK:
  400. /* End of block */
  401. if (g_ascii_isalnum (*p) || *p == '.' || *p == '-' ||
  402. *p == '_' || *p == ':') {
  403. p++;
  404. }
  405. else if (*p == '[') {
  406. s = ++p;
  407. state = RSPAMD_RECV_STATE_PARSE_IP;
  408. res = &r->real_ip;
  409. next_state = RSPAMD_RECV_STATE_BRACES_BLOCK;
  410. }
  411. else {
  412. if (p > s) {
  413. /* Got some real hostname */
  414. /* check whether it is helo or p is not space symbol */
  415. if (!g_ascii_isspace (*p) || *(p + 1) != '[') {
  416. /* Exim style ([ip]:port helo=hostname) */
  417. if (*s == ':' && (g_ascii_isspace (*p) || *p == ')')) {
  418. /* Ip ending */
  419. is_exim = TRUE;
  420. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  421. next_state = RSPAMD_RECV_STATE_BRACES_BLOCK;
  422. }
  423. else if (p - s == 4 && memcmp (s, "helo=", 5) == 0) {
  424. p++;
  425. is_exim = TRUE;
  426. if (r->real_hostname == NULL && r->from_hostname !=
  427. NULL) {
  428. r->real_hostname = r->from_hostname;
  429. }
  430. s = p;
  431. while (*p != ')' && !g_ascii_isspace (*p) && *p !=
  432. '\0') {
  433. p++;
  434. }
  435. if (p > s) {
  436. r->from_hostname = rspamd_mempool_alloc (pool,
  437. p - s + 1);
  438. rspamd_strlcpy (r->from_hostname, s, p - s + 1);
  439. }
  440. }
  441. else if (p - s == 4 && memcmp (s, "port=", 5) == 0) {
  442. p++;
  443. is_exim = TRUE;
  444. while (g_ascii_isdigit (*p)) {
  445. p++;
  446. }
  447. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  448. next_state = RSPAMD_RECV_STATE_BRACES_BLOCK;
  449. }
  450. else if (*p == '=' && is_exim) {
  451. /* Just skip unknown pairs */
  452. p++;
  453. while (!g_ascii_isspace (*p) && *p != ')' && *p !=
  454. '\0') {
  455. p++;
  456. }
  457. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  458. next_state = RSPAMD_RECV_STATE_BRACES_BLOCK;
  459. }
  460. else {
  461. /* skip all */
  462. while (*p++ != ')' && *p != '\0') ;
  463. state = RSPAMD_RECV_STATE_IP_BLOCK;
  464. }
  465. }
  466. else {
  467. /* Postfix style (hostname [ip]) */
  468. t = *p;
  469. *p = '\0';
  470. r->real_hostname = rspamd_mempool_strdup (pool, s);
  471. *p = t;
  472. /* Now parse ip */
  473. p += 2;
  474. s = p;
  475. res = &r->real_ip;
  476. state = RSPAMD_RECV_STATE_PARSE_IP;
  477. next_state = RSPAMD_RECV_STATE_BRACES_BLOCK;
  478. continue;
  479. }
  480. if (*p == ')') {
  481. p++;
  482. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  483. next_state = RSPAMD_RECV_STATE_IP_BLOCK;
  484. }
  485. }
  486. else if (*p == ')') {
  487. p++;
  488. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  489. next_state = RSPAMD_RECV_STATE_IP_BLOCK;
  490. }
  491. else {
  492. r->is_error = 1;
  493. return;
  494. }
  495. }
  496. break;
  497. /* Got by word */
  498. case RSPAMD_RECV_STATE_BY_BLOCK:
  499. /* Here can be only hostname */
  500. if ((g_ascii_isalnum (*p) || *p == '.' || *p == '-'
  501. || *p == '_') && p[1] != '\0') {
  502. p++;
  503. }
  504. else {
  505. /* We got something like hostname */
  506. if (p[1] != '\0') {
  507. t = *p;
  508. *p = '\0';
  509. r->by_hostname = rspamd_mempool_strdup (pool, s);
  510. *p = t;
  511. }
  512. else {
  513. r->by_hostname = rspamd_mempool_strdup (pool, s);
  514. }
  515. /* Now end of parsing */
  516. if (is_exim) {
  517. /* Adjust for exim received */
  518. if (r->real_ip == NULL && r->from_ip != NULL) {
  519. r->real_ip = r->from_ip;
  520. }
  521. else if (r->from_ip == NULL && r->real_ip != NULL) {
  522. r->from_ip = r->real_ip;
  523. if (r->real_hostname == NULL && r->from_hostname !=
  524. NULL) {
  525. r->real_hostname = r->from_hostname;
  526. }
  527. }
  528. }
  529. return;
  530. }
  531. break;
  532. /* Extract ip */
  533. case RSPAMD_RECV_STATE_PARSE_IP:
  534. while (g_ascii_isxdigit (*p) || *p == '.' || *p == ':') {
  535. p++;
  536. }
  537. if (*p != ']') {
  538. /* Not an ip in fact */
  539. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  540. p++;
  541. }
  542. else {
  543. *p = '\0';
  544. *res = rspamd_mempool_strdup (pool, s);
  545. *p = ']';
  546. p++;
  547. state = RSPAMD_RECV_STATE_SKIP_SPACES;
  548. }
  549. break;
  550. /* Skip spaces */
  551. case RSPAMD_RECV_STATE_SKIP_SPACES:
  552. if (!g_ascii_isspace (*p)) {
  553. state = next_state;
  554. s = p;
  555. }
  556. else {
  557. p++;
  558. }
  559. break;
  560. default:
  561. r->is_error = 1;
  562. return;
  563. break;
  564. }
  565. }
  566. r->is_error = 1;
  567. return;
  568. }
  569. static void
  570. append_raw_header (GHashTable *target, struct raw_header *rh)
  571. {
  572. struct raw_header *lp;
  573. rh->next = NULL;
  574. rh->prev = rh;
  575. if ((lp =
  576. g_hash_table_lookup (target, rh->name)) != NULL) {
  577. DL_APPEND (lp, rh);
  578. }
  579. else {
  580. g_hash_table_insert (target, rh->name, rh);
  581. }
  582. debug_task ("add raw header %s: %s", rh->name, rh->value);
  583. }
  584. /* Convert raw headers to a list of struct raw_header * */
  585. static void
  586. process_raw_headers (GHashTable *target, rspamd_mempool_t *pool, const gchar *in)
  587. {
  588. struct raw_header *new = NULL;
  589. const gchar *p, *c;
  590. gchar *tmp, *tp;
  591. gint state = 0, l, next_state = 100, err_state = 100, t_state;
  592. gboolean valid_folding = FALSE;
  593. p = in;
  594. c = p;
  595. while (*p) {
  596. /* FSM for processing headers */
  597. switch (state) {
  598. case 0:
  599. /* Begin processing headers */
  600. if (!g_ascii_isalpha (*p)) {
  601. /* We have some garbage at the beginning of headers, skip this line */
  602. state = 100;
  603. next_state = 0;
  604. }
  605. else {
  606. state = 1;
  607. c = p;
  608. }
  609. break;
  610. case 1:
  611. /* We got something like header's name */
  612. if (*p == ':') {
  613. new =
  614. rspamd_mempool_alloc0 (pool,
  615. sizeof (struct raw_header));
  616. new->prev = new;
  617. l = p - c;
  618. tmp = rspamd_mempool_alloc (pool, l + 1);
  619. rspamd_strlcpy (tmp, c, l + 1);
  620. new->name = tmp;
  621. new->empty_separator = TRUE;
  622. p++;
  623. state = 2;
  624. c = p;
  625. }
  626. else if (g_ascii_isspace (*p)) {
  627. /* Not header but some garbage */
  628. state = 100;
  629. next_state = 0;
  630. }
  631. else {
  632. p++;
  633. }
  634. break;
  635. case 2:
  636. /* We got header's name, so skip any \t or spaces */
  637. if (*p == '\t') {
  638. new->tab_separated = TRUE;
  639. new->empty_separator = FALSE;
  640. p++;
  641. }
  642. else if (*p == ' ') {
  643. new->empty_separator = FALSE;
  644. p++;
  645. }
  646. else if (*p == '\n' || *p == '\r') {
  647. /* Process folding */
  648. state = 99;
  649. l = p - c;
  650. if (l > 0) {
  651. tmp = rspamd_mempool_alloc (pool, l + 1);
  652. rspamd_strlcpy (tmp, c, l + 1);
  653. new->separator = tmp;
  654. }
  655. next_state = 3;
  656. err_state = 5;
  657. c = p;
  658. }
  659. else {
  660. /* Process value */
  661. l = p - c;
  662. if (l >= 0) {
  663. tmp = rspamd_mempool_alloc (pool, l + 1);
  664. rspamd_strlcpy (tmp, c, l + 1);
  665. new->separator = tmp;
  666. }
  667. c = p;
  668. state = 3;
  669. }
  670. break;
  671. case 3:
  672. if (*p == '\r' || *p == '\n') {
  673. /* Hold folding */
  674. state = 99;
  675. next_state = 3;
  676. err_state = 4;
  677. }
  678. else if (*(p + 1) == '\0') {
  679. state = 4;
  680. }
  681. else {
  682. p++;
  683. }
  684. break;
  685. case 4:
  686. /* Copy header's value */
  687. l = p - c;
  688. tmp = rspamd_mempool_alloc (pool, l + 1);
  689. tp = tmp;
  690. t_state = 0;
  691. while (l--) {
  692. if (t_state == 0) {
  693. /* Before folding */
  694. if (*c == '\n' || *c == '\r') {
  695. t_state = 1;
  696. c++;
  697. *tp++ = ' ';
  698. }
  699. else {
  700. *tp++ = *c++;
  701. }
  702. }
  703. else if (t_state == 1) {
  704. /* Inside folding */
  705. if (g_ascii_isspace (*c)) {
  706. c++;
  707. }
  708. else {
  709. t_state = 0;
  710. *tp++ = *c++;
  711. }
  712. }
  713. }
  714. /* Strip last space that can be added by \r\n parsing */
  715. if (*(tp - 1) == ' ') {
  716. tp--;
  717. }
  718. *tp = '\0';
  719. new->value = tmp;
  720. new->decoded = g_mime_utils_header_decode_text (new->value);
  721. rspamd_mempool_add_destructor (pool,
  722. (rspamd_mempool_destruct_t)g_free, new->decoded);
  723. append_raw_header (target, new);
  724. state = 0;
  725. break;
  726. case 5:
  727. /* Header has only name, no value */
  728. new->value = "";
  729. new->decoded = NULL;
  730. append_raw_header (target, new);
  731. state = 0;
  732. break;
  733. case 99:
  734. /* Folding state */
  735. if (*(p + 1) == '\0') {
  736. state = err_state;
  737. }
  738. else {
  739. if (*p == '\r' || *p == '\n') {
  740. p++;
  741. valid_folding = FALSE;
  742. }
  743. else if (*p == '\t' || *p == ' ') {
  744. /* Valid folding */
  745. p++;
  746. valid_folding = TRUE;
  747. }
  748. else {
  749. if (valid_folding) {
  750. debug_task ("go to state: %d->%d", state, next_state);
  751. state = next_state;
  752. }
  753. else {
  754. /* Fall back */
  755. debug_task ("go to state: %d->%d", state, err_state);
  756. state = err_state;
  757. }
  758. }
  759. }
  760. break;
  761. case 100:
  762. /* Fail state, skip line */
  763. if (*p == '\r') {
  764. if (*(p + 1) == '\n') {
  765. p++;
  766. }
  767. p++;
  768. state = next_state;
  769. }
  770. else if (*p == '\n') {
  771. if (*(p + 1) == '\r') {
  772. p++;
  773. }
  774. p++;
  775. state = next_state;
  776. }
  777. else if (*(p + 1) == '\0') {
  778. state = next_state;
  779. p++;
  780. }
  781. else {
  782. p++;
  783. }
  784. break;
  785. }
  786. }
  787. }
  788. static void
  789. free_byte_array_callback (void *pointer)
  790. {
  791. GByteArray *arr = (GByteArray *) pointer;
  792. g_byte_array_free (arr, TRUE);
  793. }
  794. static gboolean
  795. charset_validate (rspamd_mempool_t *pool, const gchar *in, gchar **out)
  796. {
  797. /*
  798. * This is a simple routine to validate input charset
  799. * we just check that charset starts with alphanumeric and ends
  800. * with alphanumeric
  801. */
  802. const gchar *begin, *end;
  803. gboolean changed = FALSE, to_uppercase = FALSE;
  804. begin = in;
  805. while (!g_ascii_isalnum (*begin)) {
  806. begin ++;
  807. changed = TRUE;
  808. }
  809. if (!g_ascii_islower(*begin)) {
  810. changed = TRUE;
  811. to_uppercase = TRUE;
  812. }
  813. end = begin + strlen (begin) - 1;
  814. while (!g_ascii_isalnum (*end)) {
  815. end --;
  816. changed = TRUE;
  817. }
  818. if (!changed) {
  819. *out = (gchar *)in;
  820. }
  821. else {
  822. *out = rspamd_mempool_alloc (pool, end - begin + 2);
  823. if (to_uppercase) {
  824. gchar *o = *out;
  825. while (begin != end + 1) {
  826. if (g_ascii_islower (*begin)) {
  827. *o++ = g_ascii_toupper (*begin ++);
  828. }
  829. else {
  830. *o++ = *begin++;
  831. }
  832. }
  833. *o = '\0';
  834. }
  835. else {
  836. rspamd_strlcpy (*out, begin, end - begin + 2);
  837. }
  838. }
  839. return TRUE;
  840. }
  841. static GQuark
  842. converter_error_quark (void)
  843. {
  844. return g_quark_from_static_string ("conversion error");
  845. }
  846. static gchar *
  847. rspamd_text_to_utf8 (struct rspamd_task *task,
  848. gchar *input, gsize len, const gchar *in_enc,
  849. gsize *olen, GError **err)
  850. {
  851. gchar *res, *s, *d;
  852. gsize outlen;
  853. iconv_t ic;
  854. gsize processed, ret;
  855. ic = iconv_open (UTF8_CHARSET, in_enc);
  856. if (ic == (iconv_t)-1) {
  857. g_set_error (err, converter_error_quark(), EINVAL,
  858. "cannot open iconv for: %s", in_enc);
  859. return NULL;
  860. }
  861. /* For the most of charsets utf8 notation is larger than native one */
  862. outlen = len * 2 + 1;
  863. res = rspamd_mempool_alloc (task->task_pool, outlen);
  864. s = input;
  865. d = res;
  866. processed = outlen - 1;
  867. while (len > 0 && processed > 0) {
  868. ret = iconv (ic, &s, &len, &d, &processed);
  869. if (ret == (gsize)-1) {
  870. switch (errno) {
  871. case E2BIG:
  872. g_set_error (err, converter_error_quark(), EINVAL,
  873. "output of size %zd is not enough to handle "
  874. "converison of %zd bytes", outlen, len);
  875. iconv_close (ic);
  876. return NULL;
  877. case EILSEQ:
  878. case EINVAL:
  879. /* Ignore bad characters */
  880. if (processed > 0 && len > 0) {
  881. *d++ = '?';
  882. s++;
  883. len --;
  884. processed --;
  885. }
  886. break;
  887. }
  888. }
  889. else if (ret == 0) {
  890. break;
  891. }
  892. }
  893. *d = '\0';
  894. *olen = d - res;
  895. iconv_close (ic);
  896. return res;
  897. }
  898. static GByteArray *
  899. convert_text_to_utf (struct rspamd_task *task,
  900. GByteArray * part_content,
  901. GMimeContentType * type,
  902. struct mime_text_part *text_part)
  903. {
  904. GError *err = NULL;
  905. gsize write_bytes;
  906. const gchar *charset;
  907. gchar *res_str, *ocharset;
  908. GByteArray *result_array;
  909. if (task->cfg->raw_mode) {
  910. text_part->is_raw = TRUE;
  911. return part_content;
  912. }
  913. if ((charset =
  914. g_mime_content_type_get_parameter (type, "charset")) == NULL) {
  915. text_part->is_raw = TRUE;
  916. return part_content;
  917. }
  918. if (!charset_validate (task->task_pool, charset, &ocharset)) {
  919. msg_info (
  920. "<%s>: has invalid charset",
  921. task->message_id);
  922. text_part->is_raw = TRUE;
  923. return part_content;
  924. }
  925. if (g_ascii_strcasecmp (ocharset,
  926. "utf-8") == 0 || g_ascii_strcasecmp (ocharset, "utf8") == 0) {
  927. if (g_utf8_validate (part_content->data, part_content->len, NULL)) {
  928. text_part->is_raw = FALSE;
  929. text_part->is_utf = TRUE;
  930. return part_content;
  931. }
  932. else {
  933. msg_info (
  934. "<%s>: contains invalid utf8 characters, assume it as raw",
  935. task->message_id);
  936. text_part->is_raw = TRUE;
  937. return part_content;
  938. }
  939. }
  940. res_str = rspamd_text_to_utf8 (task, part_content->data,
  941. part_content->len,
  942. ocharset,
  943. &write_bytes,
  944. &err);
  945. if (res_str == NULL) {
  946. msg_warn ("<%s>: cannot convert from %s to utf8: %s",
  947. task->message_id,
  948. ocharset,
  949. err ? err->message : "unknown problem");
  950. text_part->is_raw = TRUE;
  951. g_error_free (err);
  952. return part_content;
  953. }
  954. result_array = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray));
  955. result_array->data = res_str;
  956. result_array->len = write_bytes;
  957. text_part->is_raw = FALSE;
  958. text_part->is_utf = TRUE;
  959. return result_array;
  960. }
  961. struct language_match {
  962. const char *code;
  963. const char *name;
  964. GUnicodeScript script;
  965. };
  966. static int
  967. language_elts_cmp (const void *a, const void *b)
  968. {
  969. GUnicodeScript sc = *(const GUnicodeScript *)a;
  970. const struct language_match *bb = (const struct language_match *)b;
  971. return (sc - bb->script);
  972. }
  973. static void
  974. detect_text_language (struct mime_text_part *part)
  975. {
  976. /* Keep sorted */
  977. static const struct language_match language_codes[] = {
  978. { "", "english", G_UNICODE_SCRIPT_COMMON },
  979. { "", "", G_UNICODE_SCRIPT_INHERITED },
  980. { "ar", "arabic", G_UNICODE_SCRIPT_ARABIC },
  981. { "hy", "armenian", G_UNICODE_SCRIPT_ARMENIAN },
  982. { "bn", "chineese", G_UNICODE_SCRIPT_BENGALI },
  983. { "", "", G_UNICODE_SCRIPT_BOPOMOFO },
  984. { "chr", "", G_UNICODE_SCRIPT_CHEROKEE },
  985. { "cop", "", G_UNICODE_SCRIPT_COPTIC },
  986. { "ru", "russian", G_UNICODE_SCRIPT_CYRILLIC },
  987. /* Deseret was used to write English */
  988. { "", "", G_UNICODE_SCRIPT_DESERET },
  989. { "hi", "", G_UNICODE_SCRIPT_DEVANAGARI },
  990. { "am", "", G_UNICODE_SCRIPT_ETHIOPIC },
  991. { "ka", "", G_UNICODE_SCRIPT_GEORGIAN },
  992. { "", "", G_UNICODE_SCRIPT_GOTHIC },
  993. { "el", "greek", G_UNICODE_SCRIPT_GREEK },
  994. { "gu", "", G_UNICODE_SCRIPT_GUJARATI },
  995. { "pa", "", G_UNICODE_SCRIPT_GURMUKHI },
  996. { "han", "chineese", G_UNICODE_SCRIPT_HAN },
  997. { "ko", "", G_UNICODE_SCRIPT_HANGUL },
  998. { "he", "hebrew", G_UNICODE_SCRIPT_HEBREW },
  999. { "ja", "", G_UNICODE_SCRIPT_HIRAGANA },
  1000. { "kn", "", G_UNICODE_SCRIPT_KANNADA },
  1001. { "ja", "", G_UNICODE_SCRIPT_KATAKANA },
  1002. { "km", "", G_UNICODE_SCRIPT_KHMER },
  1003. { "lo", "", G_UNICODE_SCRIPT_LAO },
  1004. { "en", "english", G_UNICODE_SCRIPT_LATIN },
  1005. { "ml", "", G_UNICODE_SCRIPT_MALAYALAM },
  1006. { "mn", "", G_UNICODE_SCRIPT_MONGOLIAN },
  1007. { "my", "", G_UNICODE_SCRIPT_MYANMAR },
  1008. /* Ogham was used to write old Irish */
  1009. { "", "", G_UNICODE_SCRIPT_OGHAM },
  1010. { "", "", G_UNICODE_SCRIPT_OLD_ITALIC },
  1011. { "or", "", G_UNICODE_SCRIPT_ORIYA },
  1012. { "", "", G_UNICODE_SCRIPT_RUNIC },
  1013. { "si", "", G_UNICODE_SCRIPT_SINHALA },
  1014. { "syr", "", G_UNICODE_SCRIPT_SYRIAC },
  1015. { "ta", "", G_UNICODE_SCRIPT_TAMIL },
  1016. { "te", "", G_UNICODE_SCRIPT_TELUGU },
  1017. { "dv", "", G_UNICODE_SCRIPT_THAANA },
  1018. { "th", "", G_UNICODE_SCRIPT_THAI },
  1019. { "bo", "", G_UNICODE_SCRIPT_TIBETAN },
  1020. { "iu", "", G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL },
  1021. { "", "", G_UNICODE_SCRIPT_YI },
  1022. { "tl", "", G_UNICODE_SCRIPT_TAGALOG },
  1023. /* Phillipino languages/scripts */
  1024. { "hnn", "", G_UNICODE_SCRIPT_HANUNOO },
  1025. { "bku", "", G_UNICODE_SCRIPT_BUHID },
  1026. { "tbw", "", G_UNICODE_SCRIPT_TAGBANWA },
  1027. { "", "", G_UNICODE_SCRIPT_BRAILLE },
  1028. { "", "", G_UNICODE_SCRIPT_CYPRIOT },
  1029. { "", "", G_UNICODE_SCRIPT_LIMBU },
  1030. /* Used for Somali (so) in the past */
  1031. { "", "", G_UNICODE_SCRIPT_OSMANYA },
  1032. /* The Shavian alphabet was designed for English */
  1033. { "", "", G_UNICODE_SCRIPT_SHAVIAN },
  1034. { "", "", G_UNICODE_SCRIPT_LINEAR_B },
  1035. { "", "", G_UNICODE_SCRIPT_TAI_LE },
  1036. { "uga", "", G_UNICODE_SCRIPT_UGARITIC },
  1037. { "", "", G_UNICODE_SCRIPT_NEW_TAI_LUE },
  1038. { "bug", "", G_UNICODE_SCRIPT_BUGINESE },
  1039. { "", "", G_UNICODE_SCRIPT_GLAGOLITIC },
  1040. /* Used for for Berber (ber), but Arabic script is more common */
  1041. { "", "", G_UNICODE_SCRIPT_TIFINAGH },
  1042. { "syl", "", G_UNICODE_SCRIPT_SYLOTI_NAGRI },
  1043. { "peo", "", G_UNICODE_SCRIPT_OLD_PERSIAN },
  1044. { "", "", G_UNICODE_SCRIPT_KHAROSHTHI },
  1045. { "", "", G_UNICODE_SCRIPT_UNKNOWN },
  1046. { "", "", G_UNICODE_SCRIPT_BALINESE },
  1047. { "", "", G_UNICODE_SCRIPT_CUNEIFORM },
  1048. { "", "", G_UNICODE_SCRIPT_PHOENICIAN },
  1049. { "", "", G_UNICODE_SCRIPT_PHAGS_PA },
  1050. { "nqo", "", G_UNICODE_SCRIPT_NKO }
  1051. };
  1052. const struct language_match *lm;
  1053. const int max_chars = 32;
  1054. if (part != NULL) {
  1055. if (part->is_utf) {
  1056. /* Try to detect encoding by several symbols */
  1057. const gchar *p, *pp;
  1058. gunichar c;
  1059. gint32 remain = part->content->len, max = 0, processed = 0;
  1060. gint32 scripts[G_N_ELEMENTS (language_codes)];
  1061. GUnicodeScript scc, sel = G_UNICODE_SCRIPT_COMMON;
  1062. p = part->content->data;
  1063. memset (scripts, 0, sizeof (scripts));
  1064. while (remain > 0 && processed < max_chars) {
  1065. c = g_utf8_get_char_validated (p, remain);
  1066. if (c == (gunichar) -2 || c == (gunichar) -1) {
  1067. break;
  1068. }
  1069. if (g_unichar_isalpha (c)) {
  1070. scc = g_unichar_get_script (c);
  1071. if (scc < (gint)G_N_ELEMENTS (scripts)) {
  1072. scripts[scc]++;
  1073. }
  1074. processed ++;
  1075. }
  1076. pp = g_utf8_next_char (p);
  1077. remain -= pp - p;
  1078. p = pp;
  1079. }
  1080. for (remain = 0; remain < (gint)G_N_ELEMENTS (scripts); remain++) {
  1081. if (scripts[remain] > max) {
  1082. max = scripts[remain];
  1083. sel = remain;
  1084. }
  1085. }
  1086. part->script = sel;
  1087. lm = bsearch (&sel, language_codes, G_N_ELEMENTS (language_codes),
  1088. sizeof (language_codes[0]), &language_elts_cmp);
  1089. if (lm != NULL) {
  1090. part->lang_code = lm->code;
  1091. part->language = lm->name;
  1092. }
  1093. }
  1094. }
  1095. }
  1096. static void
  1097. process_text_part (struct rspamd_task *task,
  1098. GByteArray *part_content,
  1099. GMimeContentType *type,
  1100. GMimeObject *part,
  1101. GMimeObject *parent,
  1102. gboolean is_empty)
  1103. {
  1104. struct mime_text_part *text_part;
  1105. const gchar *cd;
  1106. /* Skip attachements */
  1107. #ifndef GMIME24
  1108. cd = g_mime_part_get_content_disposition (GMIME_PART (part));
  1109. if (cd &&
  1110. g_ascii_strcasecmp (cd,
  1111. "attachment") == 0 && !task->cfg->check_text_attachements) {
  1112. debug_task ("skip attachments for checking as text parts");
  1113. return;
  1114. }
  1115. #else
  1116. cd = g_mime_object_get_disposition (GMIME_OBJECT (part));
  1117. if (cd &&
  1118. g_ascii_strcasecmp (cd,
  1119. GMIME_DISPOSITION_ATTACHMENT) == 0 &&
  1120. !task->cfg->check_text_attachements) {
  1121. debug_task ("skip attachments for checking as text parts");
  1122. return;
  1123. }
  1124. #endif
  1125. if (g_mime_content_type_is_type (type, "text",
  1126. "html") || g_mime_content_type_is_type (type, "text", "xhtml")) {
  1127. text_part =
  1128. rspamd_mempool_alloc0 (task->task_pool,
  1129. sizeof (struct mime_text_part));
  1130. text_part->is_html = TRUE;
  1131. if (is_empty) {
  1132. text_part->is_empty = TRUE;
  1133. text_part->orig = NULL;
  1134. text_part->content = NULL;
  1135. task->text_parts = g_list_prepend (task->text_parts, text_part);
  1136. return;
  1137. }
  1138. text_part->orig = part_content;
  1139. part_content = convert_text_to_utf (task,
  1140. text_part->orig,
  1141. type,
  1142. text_part);
  1143. text_part->is_balanced = TRUE;
  1144. text_part->html_nodes = NULL;
  1145. text_part->parent = parent;
  1146. text_part->content = strip_html_tags (task,
  1147. task->task_pool,
  1148. text_part,
  1149. part_content,
  1150. NULL);
  1151. if (text_part->html_nodes != NULL) {
  1152. decode_entitles (text_part->content->data,
  1153. &text_part->content->len);
  1154. }
  1155. url_parse_text (task->task_pool, task, text_part, TRUE);
  1156. rspamd_fuzzy_from_text_part (text_part, task->task_pool, task->cfg->max_diff);
  1157. rspamd_mempool_add_destructor (task->task_pool,
  1158. (rspamd_mempool_destruct_t) free_byte_array_callback,
  1159. text_part->content);
  1160. task->text_parts = g_list_prepend (task->text_parts, text_part);
  1161. }
  1162. else if (g_mime_content_type_is_type (type, "text", "*")) {
  1163. text_part =
  1164. rspamd_mempool_alloc0 (task->task_pool,
  1165. sizeof (struct mime_text_part));
  1166. text_part->is_html = FALSE;
  1167. text_part->parent = parent;
  1168. if (is_empty) {
  1169. text_part->is_empty = TRUE;
  1170. text_part->orig = NULL;
  1171. text_part->content = NULL;
  1172. task->text_parts = g_list_prepend (task->text_parts, text_part);
  1173. return;
  1174. }
  1175. text_part->content = convert_text_to_utf (task,
  1176. part_content,
  1177. type,
  1178. text_part);
  1179. text_part->orig = part_content;
  1180. url_parse_text (task->task_pool, task, text_part, FALSE);
  1181. rspamd_fuzzy_from_text_part (text_part, task->task_pool, task->cfg->max_diff);
  1182. task->text_parts = g_list_prepend (task->text_parts, text_part);
  1183. }
  1184. else {
  1185. return;
  1186. }
  1187. /* Post process part */
  1188. detect_text_language (text_part);
  1189. text_part->words = rspamd_tokenize_text (text_part->content->data,
  1190. text_part->content->len, text_part->is_utf, 4,
  1191. &text_part->urls_offset);
  1192. }
  1193. #ifdef GMIME24
  1194. static void
  1195. mime_foreach_callback (GMimeObject * parent,
  1196. GMimeObject * part,
  1197. gpointer user_data)
  1198. #else
  1199. static void
  1200. mime_foreach_callback (GMimeObject * part, gpointer user_data)
  1201. #endif
  1202. {
  1203. struct rspamd_task *task = (struct rspamd_task *)user_data;
  1204. struct mime_part *mime_part;
  1205. GMimeContentType *type;
  1206. GMimeDataWrapper *wrapper;
  1207. GMimeStream *part_stream;
  1208. GByteArray *part_content;
  1209. task->parts_count++;
  1210. /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */
  1211. /* find out what class 'part' is... */
  1212. if (GMIME_IS_MESSAGE_PART (part)) {
  1213. /* message/rfc822 or message/news */
  1214. GMimeMessage *message;
  1215. /* g_mime_message_foreach_part() won't descend into
  1216. child message parts, so if we want to count any
  1217. subparts of this child message, we'll have to call
  1218. g_mime_message_foreach_part() again here. */
  1219. message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
  1220. if (task->parser_recursion++ < RECURSION_LIMIT) {
  1221. #ifdef GMIME24
  1222. g_mime_message_foreach (message, mime_foreach_callback, task);
  1223. #else
  1224. g_mime_message_foreach_part (message, mime_foreach_callback, task);
  1225. #endif
  1226. }
  1227. else {
  1228. msg_err ("endless recursion detected: %d", task->parser_recursion);
  1229. return;
  1230. }
  1231. #ifndef GMIME24
  1232. g_object_unref (message);
  1233. #endif
  1234. }
  1235. else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
  1236. /* message/partial */
  1237. /* this is an incomplete message part, probably a
  1238. large message that the sender has broken into
  1239. smaller parts and is sending us bit by bit. we
  1240. could save some info about it so that we could
  1241. piece this back together again once we get all the
  1242. parts? */
  1243. }
  1244. else if (GMIME_IS_MULTIPART (part)) {
  1245. /* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */
  1246. task->parser_parent_part = part;
  1247. #ifndef GMIME24
  1248. debug_task ("detected multipart part");
  1249. /* we'll get to finding out if this is a signed/encrypted multipart later... */
  1250. if (task->parser_recursion++ < RECURSION_LIMIT) {
  1251. g_mime_multipart_foreach ((GMimeMultipart *) part,
  1252. mime_foreach_callback,
  1253. task);
  1254. }
  1255. else {
  1256. msg_err ("endless recursion detected: %d", task->parser_recursion);
  1257. return;
  1258. }
  1259. #endif
  1260. }
  1261. else if (GMIME_IS_PART (part)) {
  1262. /* a normal leaf part, could be text/plain or image/jpeg etc */
  1263. #ifdef GMIME24
  1264. type = (GMimeContentType *) g_mime_object_get_content_type (GMIME_OBJECT (
  1265. part));
  1266. #else
  1267. type =
  1268. (GMimeContentType *) g_mime_part_get_content_type (GMIME_PART (part));
  1269. #endif
  1270. if (type == NULL) {
  1271. msg_warn ("type of part is unknown, assume text/plain");
  1272. type = g_mime_content_type_new ("text", "plain");
  1273. #ifdef GMIME24
  1274. rspamd_mempool_add_destructor (task->task_pool,
  1275. (rspamd_mempool_destruct_t) g_object_unref, type);
  1276. #else
  1277. rspamd_mempool_add_destructor (task->task_pool,
  1278. (rspamd_mempool_destruct_t) g_mime_content_type_destroy, type);
  1279. #endif
  1280. }
  1281. wrapper = g_mime_part_get_content_object (GMIME_PART (part));
  1282. #ifdef GMIME24
  1283. if (wrapper != NULL && GMIME_IS_DATA_WRAPPER (wrapper)) {
  1284. #else
  1285. if (wrapper != NULL) {
  1286. #endif
  1287. part_stream = g_mime_stream_mem_new ();
  1288. if (g_mime_data_wrapper_write_to_stream (wrapper,
  1289. part_stream) != -1) {
  1290. gchar *hdrs;
  1291. g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (
  1292. part_stream), FALSE);
  1293. part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (
  1294. part_stream));
  1295. g_object_unref (part_stream);
  1296. mime_part =
  1297. rspamd_mempool_alloc (task->task_pool,
  1298. sizeof (struct mime_part));
  1299. hdrs = g_mime_object_get_headers (GMIME_OBJECT (part));
  1300. mime_part->raw_headers = g_hash_table_new (rspamd_strcase_hash,
  1301. rspamd_strcase_equal);
  1302. rspamd_mempool_add_destructor (task->task_pool,
  1303. (rspamd_mempool_destruct_t) g_hash_table_destroy,
  1304. mime_part->raw_headers);
  1305. if (hdrs != NULL) {
  1306. process_raw_headers (mime_part->raw_headers,
  1307. task->task_pool, hdrs);
  1308. g_free (hdrs);
  1309. }
  1310. mime_part->type = type;
  1311. mime_part->content = part_content;
  1312. mime_part->parent = task->parser_parent_part;
  1313. mime_part->filename = g_mime_part_get_filename (GMIME_PART (
  1314. part));
  1315. debug_task ("found part with content-type: %s/%s",
  1316. type->type,
  1317. type->subtype);
  1318. task->parts = g_list_prepend (task->parts, mime_part);
  1319. /* Skip empty parts */
  1320. process_text_part (task,
  1321. part_content,
  1322. type,
  1323. part,
  1324. task->parser_parent_part,
  1325. (part_content->len <= 0));
  1326. }
  1327. else {
  1328. msg_warn ("write to stream failed: %d, %s", errno,
  1329. strerror (errno));
  1330. }
  1331. #ifndef GMIME24
  1332. g_object_unref (wrapper);
  1333. #endif
  1334. }
  1335. else {
  1336. msg_warn ("cannot get wrapper for mime part, type of part: %s/%s",
  1337. type->type,
  1338. type->subtype);
  1339. }
  1340. }
  1341. else {
  1342. g_assert_not_reached ();
  1343. }
  1344. }
  1345. static void
  1346. destroy_message (void *pointer)
  1347. {
  1348. GMimeMessage *msg = pointer;
  1349. msg_debug ("freeing pointer %p", msg);
  1350. g_object_unref (msg);
  1351. }
  1352. gint
  1353. process_message (struct rspamd_task *task)
  1354. {
  1355. GMimeMessage *message;
  1356. GMimeParser *parser;
  1357. GMimeStream *stream;
  1358. GByteArray *tmp;
  1359. GList *first, *cur;
  1360. GMimePart *part;
  1361. GMimeDataWrapper *wrapper;
  1362. struct received_header *recv;
  1363. gchar *mid, *url_str, *p, *end, *url_end;
  1364. struct uri *subject_url;
  1365. gsize len;
  1366. gint rc;
  1367. tmp = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray));
  1368. tmp->data = task->msg->str;
  1369. tmp->len = task->msg->len;
  1370. stream = g_mime_stream_mem_new_with_byte_array (tmp);
  1371. /*
  1372. * This causes g_mime_stream not to free memory by itself as it is memory allocated by
  1373. * pool allocator
  1374. */
  1375. g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (stream), FALSE);
  1376. if (task->is_mime) {
  1377. debug_task ("construct mime parser from string length %d",
  1378. (gint)task->msg->len);
  1379. /* create a new parser object to parse the stream */
  1380. parser = g_mime_parser_new_with_stream (stream);
  1381. g_object_unref (stream);
  1382. /* parse the message from the stream */
  1383. message = g_mime_parser_construct_message (parser);
  1384. if (message == NULL) {
  1385. msg_warn ("cannot construct mime from stream");
  1386. return -1;
  1387. }
  1388. task->message = message;
  1389. rspamd_mempool_add_destructor (task->task_pool,
  1390. (rspamd_mempool_destruct_t) destroy_message, task->message);
  1391. /* Save message id for future use */
  1392. task->message_id = g_mime_message_get_message_id (task->message);
  1393. if (task->message_id == NULL) {
  1394. task->message_id = "undef";
  1395. }
  1396. task->parser_recursion = 0;
  1397. #ifdef GMIME24
  1398. g_mime_message_foreach (message, mime_foreach_callback, task);
  1399. #else
  1400. /*
  1401. * This is rather strange, but gmime 2.2 do NOT pass top-level part to foreach callback
  1402. * so we need to set up parent part by hands
  1403. */
  1404. task->parser_parent_part = g_mime_message_get_mime_part (message);
  1405. g_object_unref (task->parser_parent_part);
  1406. g_mime_message_foreach_part (message, mime_foreach_callback, task);
  1407. #endif
  1408. debug_task ("found %d parts in message", task->parts_count);
  1409. if (task->queue_id == NULL) {
  1410. task->queue_id = "undef";
  1411. }
  1412. #ifdef GMIME24
  1413. task->raw_headers_str =
  1414. g_mime_object_get_headers (GMIME_OBJECT (task->message));
  1415. #else
  1416. task->raw_headers_str = g_mime_message_get_headers (task->message);
  1417. #endif
  1418. if (task->raw_headers_str) {
  1419. rspamd_mempool_add_destructor (task->task_pool,
  1420. (rspamd_mempool_destruct_t) g_free, task->raw_headers_str);
  1421. process_raw_headers (task->raw_headers, task->task_pool,
  1422. task->raw_headers_str);
  1423. }
  1424. process_images (task);
  1425. /* Parse received headers */
  1426. first =
  1427. message_get_header (task, "Received", FALSE);
  1428. cur = first;
  1429. while (cur) {
  1430. recv =
  1431. rspamd_mempool_alloc0 (task->task_pool,
  1432. sizeof (struct received_header));
  1433. parse_recv_header (task->task_pool, cur->data, recv);
  1434. task->received = g_list_prepend (task->received, recv);
  1435. cur = g_list_next (cur);
  1436. }
  1437. /* free the parser (and the stream) */
  1438. g_object_unref (parser);
  1439. }
  1440. else {
  1441. /* We got only message, no mime headers or anything like this */
  1442. /* Construct fake message for it */
  1443. message = g_mime_message_new (TRUE);
  1444. task->message = message;
  1445. if (task->from_envelope) {
  1446. g_mime_message_set_sender (task->message,
  1447. rspamd_task_get_sender (task));
  1448. }
  1449. /* Construct part for it */
  1450. part = g_mime_part_new_with_type ("text", "html");
  1451. #ifdef GMIME24
  1452. wrapper = g_mime_data_wrapper_new_with_stream (stream,
  1453. GMIME_CONTENT_ENCODING_8BIT);
  1454. #else
  1455. wrapper = g_mime_data_wrapper_new_with_stream (stream,
  1456. GMIME_PART_ENCODING_8BIT);
  1457. #endif
  1458. g_mime_part_set_content_object (part, wrapper);
  1459. g_mime_message_set_mime_part (task->message, GMIME_OBJECT (part));
  1460. /* Register destructors */
  1461. rspamd_mempool_add_destructor (task->task_pool,
  1462. (rspamd_mempool_destruct_t) g_object_unref, wrapper);
  1463. rspamd_mempool_add_destructor (task->task_pool,
  1464. (rspamd_mempool_destruct_t) g_object_unref, part);
  1465. rspamd_mempool_add_destructor (task->task_pool,
  1466. (rspamd_mempool_destruct_t) destroy_message, task->message);
  1467. /* Now parse in a normal way */
  1468. task->parser_recursion = 0;
  1469. #ifdef GMIME24
  1470. g_mime_message_foreach (task->message, mime_foreach_callback, task);
  1471. #else
  1472. g_mime_message_foreach_part (task->message, mime_foreach_callback,
  1473. task);
  1474. #endif
  1475. /* Generate message ID */
  1476. mid = g_mime_utils_generate_message_id ("localhost.localdomain");
  1477. rspamd_mempool_add_destructor (task->task_pool,
  1478. (rspamd_mempool_destruct_t) g_free, mid);
  1479. g_mime_message_set_message_id (task->message, mid);
  1480. task->message_id = mid;
  1481. task->queue_id = mid;
  1482. /* Set headers for message */
  1483. if (task->subject) {
  1484. g_mime_message_set_subject (task->message, task->subject);
  1485. }
  1486. }
  1487. /* Set mime recipients and sender for the task */
  1488. task->rcpt_mime = g_mime_message_get_all_recipients (message);
  1489. if (task->rcpt_mime) {
  1490. #ifdef GMIME24
  1491. rspamd_mempool_add_destructor (task->task_pool,
  1492. (rspamd_mempool_destruct_t) g_object_unref,
  1493. task->rcpt_mime);
  1494. #else
  1495. rspamd_mempool_add_destructor (task->task_pool,
  1496. (rspamd_mempool_destruct_t) internet_address_list_destroy,
  1497. task->rcpt_mime);
  1498. #endif
  1499. }
  1500. task->from_mime = internet_address_list_parse_string(
  1501. g_mime_message_get_sender (message));
  1502. if (task->from_mime) {
  1503. #ifdef GMIME24
  1504. rspamd_mempool_add_destructor (task->task_pool,
  1505. (rspamd_mempool_destruct_t) g_object_unref,
  1506. task->from_mime);
  1507. #else
  1508. rspamd_mempool_add_destructor (task->task_pool,
  1509. (rspamd_mempool_destruct_t) internet_address_list_destroy,
  1510. task->from_mime);
  1511. #endif
  1512. }
  1513. /* Parse urls inside Subject header */
  1514. cur = message_get_header (task, "Subject", FALSE);
  1515. if (cur) {
  1516. p = cur->data;
  1517. len = strlen (p);
  1518. end = p + len;
  1519. while (p < end) {
  1520. /* Search to the end of url */
  1521. if (url_try_text (task->task_pool, p, end - p, NULL, &url_end,
  1522. &url_str, FALSE)) {
  1523. if (url_str != NULL) {
  1524. subject_url = rspamd_mempool_alloc0 (task->task_pool,
  1525. sizeof (struct uri));
  1526. if (subject_url != NULL) {
  1527. /* Try to parse url */
  1528. rc = parse_uri (subject_url, url_str, task->task_pool);
  1529. if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES ||
  1530. rc == URI_ERRNO_NO_HOST_SLASH) &&
  1531. subject_url->hostlen > 0) {
  1532. if (subject_url->protocol != PROTOCOL_MAILTO) {
  1533. if (!g_tree_lookup (task->urls, subject_url)) {
  1534. g_tree_insert (task->urls,
  1535. subject_url,
  1536. subject_url);
  1537. }
  1538. }
  1539. }
  1540. else if (rc != URI_ERRNO_OK) {
  1541. msg_info ("extract of url '%s' failed: %s",
  1542. url_str,
  1543. url_strerror (rc));
  1544. }
  1545. }
  1546. }
  1547. }
  1548. else {
  1549. break;
  1550. }
  1551. p = url_end + 1;
  1552. }
  1553. }
  1554. return 0;
  1555. }
  1556. GList *
  1557. message_get_header (struct rspamd_task *task,
  1558. const gchar *field,
  1559. gboolean strong)
  1560. {
  1561. GList *gret = NULL;
  1562. struct raw_header *rh;
  1563. rh = g_hash_table_lookup (task->raw_headers, field);
  1564. if (rh == NULL) {
  1565. return NULL;
  1566. }
  1567. while (rh) {
  1568. if (strong) {
  1569. if (strcmp (rh->name, field) == 0) {
  1570. gret = g_list_prepend (gret, rh);
  1571. }
  1572. }
  1573. else {
  1574. gret = g_list_prepend (gret, rh);
  1575. }
  1576. rh = rh->next;
  1577. }
  1578. if (gret != NULL) {
  1579. rspamd_mempool_add_destructor (task->task_pool,
  1580. (rspamd_mempool_destruct_t)g_list_free, gret);
  1581. }
  1582. return gret;
  1583. }