You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

mime_headers.c 30KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "mime_headers.h"
  17. #include "smtp_parsers.h"
  18. #include "mime_encoding.h"
  19. #include "received.h"
  20. #include "contrib/uthash/utlist.h"
  21. #include "libserver/mempool_vars_internal.h"
  22. #include "libserver/cfg_file.h"
  23. #include "libutil/util.h"
  24. #include <unicode/utf8.h>
  25. KHASH_INIT (rspamd_mime_headers_htb, gchar *,
  26. struct rspamd_mime_header *, 1,
  27. rspamd_strcase_hash, rspamd_strcase_equal);
  28. struct rspamd_mime_headers_table {
  29. khash_t(rspamd_mime_headers_htb) htb;
  30. ref_entry_t ref;
  31. };
  32. static void
  33. rspamd_mime_header_check_special (struct rspamd_task *task,
  34. struct rspamd_mime_header *rh)
  35. {
  36. guint64 h;
  37. const gchar *p, *end;
  38. gchar *id;
  39. gint max_recipients = -1, len;
  40. if (task->cfg) {
  41. max_recipients = task->cfg->max_recipients;
  42. }
  43. h = rspamd_icase_hash (rh->name, strlen (rh->name), 0xdeadbabe);
  44. switch (h) {
  45. case 0x88705DC4D9D61ABULL: /* received */
  46. if (rspamd_received_header_parse(task, rh->decoded, strlen (rh->decoded), rh)) {
  47. rh->flags |= RSPAMD_HEADER_RECEIVED;
  48. }
  49. break;
  50. case 0x76F31A09F4352521ULL: /* to */
  51. MESSAGE_FIELD (task, rcpt_mime) = rspamd_email_address_from_mime (task->task_pool,
  52. rh->value, strlen (rh->value),
  53. MESSAGE_FIELD (task, rcpt_mime), max_recipients);
  54. rh->flags |= RSPAMD_HEADER_TO|RSPAMD_HEADER_RCPT|RSPAMD_HEADER_UNIQUE;
  55. break;
  56. case 0x7EB117C1480B76ULL: /* cc */
  57. MESSAGE_FIELD (task, rcpt_mime) = rspamd_email_address_from_mime (task->task_pool,
  58. rh->value, strlen (rh->value),
  59. MESSAGE_FIELD (task, rcpt_mime), max_recipients);
  60. rh->flags |= RSPAMD_HEADER_CC|RSPAMD_HEADER_RCPT|RSPAMD_HEADER_UNIQUE;
  61. break;
  62. case 0xE4923E11C4989C8DULL: /* bcc */
  63. MESSAGE_FIELD (task, rcpt_mime) = rspamd_email_address_from_mime (task->task_pool,
  64. rh->value, strlen (rh->value),
  65. MESSAGE_FIELD (task, rcpt_mime), max_recipients);
  66. rh->flags |= RSPAMD_HEADER_BCC|RSPAMD_HEADER_RCPT|RSPAMD_HEADER_UNIQUE;
  67. break;
  68. case 0x41E1985EDC1CBDE4ULL: /* from */
  69. MESSAGE_FIELD (task, from_mime) = rspamd_email_address_from_mime (task->task_pool,
  70. rh->value, strlen (rh->value),
  71. MESSAGE_FIELD (task, from_mime), max_recipients);
  72. rh->flags |= RSPAMD_HEADER_FROM|RSPAMD_HEADER_SENDER|RSPAMD_HEADER_UNIQUE;
  73. break;
  74. case 0x43A558FC7C240226ULL: /* message-id */ {
  75. rh->flags = RSPAMD_HEADER_MESSAGE_ID|RSPAMD_HEADER_UNIQUE;
  76. p = rh->decoded;
  77. len = rspamd_strip_smtp_comments_inplace(rh->decoded, strlen(p));
  78. rh->decoded[len] = '\0'; /* Zero terminate after stripping */
  79. end = p + len;
  80. if (*p == '<') {
  81. p++;
  82. }
  83. if (end > p) {
  84. gchar *d;
  85. if (*(end - 1) == '>') {
  86. end --;
  87. }
  88. id = rspamd_mempool_alloc (task->task_pool, end - p + 1);
  89. d = id;
  90. while (p < end) {
  91. if (g_ascii_isgraph (*p)) {
  92. *d++ = *p++;
  93. }
  94. else {
  95. *d++ = '?';
  96. p++;
  97. }
  98. }
  99. *d = '\0';
  100. MESSAGE_FIELD (task, message_id) = id;
  101. }
  102. break;
  103. }
  104. case 0xB91D3910358E8212ULL: /* subject */
  105. if (MESSAGE_FIELD (task, subject) == NULL) {
  106. MESSAGE_FIELD (task, subject) = rh->decoded;
  107. }
  108. rh->flags = RSPAMD_HEADER_SUBJECT|RSPAMD_HEADER_UNIQUE;
  109. break;
  110. case 0xEE4AA2EAAC61D6F4ULL: /* return-path */
  111. if (task->from_envelope == NULL) {
  112. task->from_envelope = rspamd_email_address_from_smtp (rh->decoded,
  113. strlen (rh->decoded));
  114. }
  115. rh->flags = RSPAMD_HEADER_RETURN_PATH|RSPAMD_HEADER_UNIQUE;
  116. break;
  117. case 0xB9EEFAD2E93C2161ULL: /* delivered-to */
  118. if (task->deliver_to == NULL) {
  119. task->deliver_to = rh->decoded;
  120. }
  121. rh->flags = RSPAMD_HEADER_DELIVERED_TO;
  122. break;
  123. case 0x2EC3BFF3C393FC10ULL: /* date */
  124. case 0xAC0DDB1A1D214CAULL: /* sender */
  125. case 0x54094572367AB695ULL: /* in-reply-to */
  126. case 0x81CD9E9131AB6A9AULL: /* content-type */
  127. case 0xC39BD9A75AA25B60ULL: /* content-transfer-encoding */
  128. case 0xB3F6704CB3AD6589ULL: /* references */
  129. rh->flags = RSPAMD_HEADER_UNIQUE;
  130. break;
  131. }
  132. }
  133. static void
  134. rspamd_mime_header_add (struct rspamd_task *task,
  135. khash_t(rspamd_mime_headers_htb) *target,
  136. struct rspamd_mime_header **order_ptr,
  137. struct rspamd_mime_header *rh,
  138. gboolean check_special)
  139. {
  140. khiter_t k;
  141. struct rspamd_mime_header *ex;
  142. int res;
  143. k = kh_put (rspamd_mime_headers_htb, target, rh->name, &res);
  144. if (res == 0) {
  145. ex = kh_value (target, k);
  146. DL_APPEND (ex, rh);
  147. msg_debug_task ("append raw header %s: %s", rh->name, rh->value);
  148. }
  149. else {
  150. kh_value (target, k) = rh;
  151. rh->prev = rh;
  152. rh->next = NULL;
  153. msg_debug_task ("add new raw header %s: %s", rh->name, rh->value);
  154. }
  155. LL_PREPEND2 (*order_ptr, rh, ord_next);
  156. if (check_special) {
  157. rspamd_mime_header_check_special (task, rh);
  158. }
  159. }
  160. /* Convert raw headers to a list of struct raw_header * */
  161. void
  162. rspamd_mime_headers_process (struct rspamd_task *task,
  163. struct rspamd_mime_headers_table *target,
  164. struct rspamd_mime_header **order_ptr,
  165. const gchar *in, gsize len,
  166. gboolean check_newlines)
  167. {
  168. struct rspamd_mime_header *nh = NULL;
  169. const gchar *p, *c, *end;
  170. gchar *tmp, *tp;
  171. gint state = 0, l, next_state = 100, err_state = 100, t_state;
  172. gboolean valid_folding = FALSE;
  173. guint nlines_count[RSPAMD_TASK_NEWLINES_MAX];
  174. guint norder = 0;
  175. p = in;
  176. end = p + len;
  177. c = p;
  178. memset (nlines_count, 0, sizeof (nlines_count));
  179. msg_debug_task ("start processing headers");
  180. while (p < end) {
  181. /* FSM for processing headers */
  182. switch (state) {
  183. case 0:
  184. /* Begin processing headers */
  185. if (!g_ascii_isalpha (*p)) {
  186. /* We have some garbage at the beginning of headers, skip this line */
  187. state = 100;
  188. next_state = 0;
  189. }
  190. else {
  191. state = 1;
  192. c = p;
  193. }
  194. break;
  195. case 1:
  196. /* We got something like header's name */
  197. if (*p == ':') {
  198. nh = rspamd_mempool_alloc0 (task->task_pool,
  199. sizeof (struct rspamd_mime_header));
  200. l = p - c;
  201. tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
  202. rspamd_null_safe_copy (c, l, tmp, l + 1);
  203. nh->name = tmp;
  204. nh->flags |= RSPAMD_HEADER_EMPTY_SEPARATOR;
  205. nh->raw_value = c;
  206. nh->raw_len = p - c; /* Including trailing ':' */
  207. p++;
  208. state = 2;
  209. c = p;
  210. }
  211. else if (g_ascii_isspace (*p)) {
  212. /* Not header but some garbage */
  213. task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS;
  214. state = 100;
  215. next_state = 0;
  216. }
  217. else {
  218. p++;
  219. }
  220. break;
  221. case 2:
  222. /* We got header's name, so skip any \t or spaces */
  223. if (*p == '\t') {
  224. nh->flags &= ~RSPAMD_HEADER_EMPTY_SEPARATOR;
  225. nh->flags |= RSPAMD_HEADER_TAB_SEPARATED;
  226. p++;
  227. }
  228. else if (*p == ' ') {
  229. nh->flags &= ~RSPAMD_HEADER_EMPTY_SEPARATOR;
  230. p++;
  231. }
  232. else if (*p == '\n' || *p == '\r') {
  233. if (check_newlines) {
  234. if (*p == '\n') {
  235. nlines_count[RSPAMD_TASK_NEWLINES_LF] ++;
  236. }
  237. else if (p + 1 < end && *(p + 1) == '\n') {
  238. nlines_count[RSPAMD_TASK_NEWLINES_CRLF] ++;
  239. }
  240. else {
  241. nlines_count[RSPAMD_TASK_NEWLINES_CR] ++;
  242. }
  243. }
  244. /* Process folding */
  245. state = 99;
  246. l = p - c;
  247. if (l > 0) {
  248. tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
  249. rspamd_null_safe_copy (c, l, tmp, l + 1);
  250. nh->separator = tmp;
  251. }
  252. next_state = 3;
  253. err_state = 5;
  254. c = p;
  255. }
  256. else {
  257. /* Process value */
  258. l = p - c;
  259. if (l >= 0) {
  260. tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
  261. rspamd_null_safe_copy (c, l, tmp, l + 1);
  262. nh->separator = tmp;
  263. }
  264. c = p;
  265. state = 3;
  266. }
  267. break;
  268. case 3:
  269. if (*p == '\r' || *p == '\n') {
  270. /* Hold folding */
  271. if (check_newlines) {
  272. if (*p == '\n') {
  273. nlines_count[RSPAMD_TASK_NEWLINES_LF] ++;
  274. }
  275. else if (p + 1 < end && *(p + 1) == '\n') {
  276. nlines_count[RSPAMD_TASK_NEWLINES_CRLF] ++;
  277. }
  278. else {
  279. nlines_count[RSPAMD_TASK_NEWLINES_CR] ++;
  280. }
  281. }
  282. state = 99;
  283. next_state = 3;
  284. err_state = 4;
  285. }
  286. else if (p + 1 == end) {
  287. state = 4;
  288. }
  289. else {
  290. p++;
  291. }
  292. break;
  293. case 4:
  294. /* Copy header's value */
  295. /*
  296. * XXX:
  297. * The original decision to use here null terminated
  298. * strings was extremely poor!
  299. */
  300. l = p - c;
  301. tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
  302. tp = tmp;
  303. t_state = 0;
  304. while (l--) {
  305. if (t_state == 0) {
  306. /* Before folding */
  307. if (*c == '\n' || *c == '\r') {
  308. t_state = 1;
  309. c++;
  310. *tp++ = ' ';
  311. }
  312. else {
  313. if (*c != '\0') {
  314. *tp++ = *c++;
  315. }
  316. else {
  317. c++;
  318. }
  319. }
  320. }
  321. else if (t_state == 1) {
  322. /* Inside folding */
  323. if (g_ascii_isspace (*c)) {
  324. c++;
  325. }
  326. else {
  327. t_state = 0;
  328. if (*c != '\0') {
  329. *tp++ = *c++;
  330. }
  331. else {
  332. c++;
  333. }
  334. }
  335. }
  336. }
  337. /* Strip last space that can be added by \r\n parsing */
  338. if (tp > tmp && *(tp - 1) == ' ') {
  339. tp--;
  340. }
  341. *tp = '\0';
  342. /* Strip the initial spaces that could also be added by folding */
  343. while (*tmp != '\0' && g_ascii_isspace (*tmp)) {
  344. tmp ++;
  345. }
  346. if (p + 1 == end) {
  347. nh->raw_len = end - nh->raw_value;
  348. }
  349. else {
  350. nh->raw_len = p - nh->raw_value;
  351. }
  352. nh->value = tmp;
  353. gboolean broken_utf = FALSE;
  354. nh->decoded = rspamd_mime_header_decode (task->task_pool,
  355. nh->value, strlen (tmp), &broken_utf);
  356. if (broken_utf) {
  357. task->flags |= RSPAMD_TASK_FLAG_BAD_UNICODE;
  358. }
  359. if (nh->decoded == NULL) {
  360. /* As we strip comments in place... */
  361. nh->decoded = rspamd_mempool_strdup (task->task_pool, "");
  362. }
  363. /* We also validate utf8 and replace all non-valid utf8 chars */
  364. rspamd_mime_charset_utf_enforce (nh->decoded, strlen (nh->decoded));
  365. nh->order = norder ++;
  366. rspamd_mime_header_add (task, &target->htb, order_ptr, nh, check_newlines);
  367. nh = NULL;
  368. state = 0;
  369. break;
  370. case 5:
  371. /* Header has only name, no value */
  372. nh->value = rspamd_mempool_strdup (task->task_pool, "");;
  373. nh->decoded = rspamd_mempool_strdup (task->task_pool, "");;
  374. nh->raw_len = p - nh->raw_value;
  375. nh->order = norder ++;
  376. rspamd_mime_header_add (task, &target->htb, order_ptr, nh, check_newlines);
  377. nh = NULL;
  378. state = 0;
  379. break;
  380. case 99:
  381. /* Folding state */
  382. if (p + 1 == end) {
  383. state = err_state;
  384. }
  385. else {
  386. if (*p == '\r' || *p == '\n') {
  387. p++;
  388. valid_folding = FALSE;
  389. }
  390. else if (*p == '\t' || *p == ' ') {
  391. /* Valid folding */
  392. p++;
  393. valid_folding = TRUE;
  394. }
  395. else {
  396. if (valid_folding) {
  397. debug_task ("go to state: %d->%d", state, next_state);
  398. state = next_state;
  399. }
  400. else {
  401. /* Fall back */
  402. debug_task ("go to state: %d->%d", state, err_state);
  403. state = err_state;
  404. }
  405. }
  406. }
  407. break;
  408. case 100:
  409. /* Fail state, skip line */
  410. if (*p == '\r') {
  411. if (p + 1 < end && *(p + 1) == '\n') {
  412. nlines_count[RSPAMD_TASK_NEWLINES_CRLF] ++;
  413. p++;
  414. }
  415. p++;
  416. state = next_state;
  417. }
  418. else if (*p == '\n') {
  419. nlines_count[RSPAMD_TASK_NEWLINES_LF] ++;
  420. if (p + 1 < end && *(p + 1) == '\r') {
  421. p++;
  422. }
  423. p++;
  424. state = next_state;
  425. }
  426. else if (p + 1 == end) {
  427. state = next_state;
  428. p++;
  429. }
  430. else {
  431. p++;
  432. }
  433. break;
  434. }
  435. }
  436. /* Since we have prepended headers, we need to reverse the list to get the actual order */
  437. LL_REVERSE (*order_ptr);
  438. if (check_newlines) {
  439. guint max_cnt = 0;
  440. gint sel = 0;
  441. rspamd_cryptobox_hash_state_t hs;
  442. guchar hout[rspamd_cryptobox_HASHBYTES], *hexout;
  443. for (gint i = RSPAMD_TASK_NEWLINES_CR; i < RSPAMD_TASK_NEWLINES_MAX; i ++) {
  444. if (nlines_count[i] > max_cnt) {
  445. max_cnt = nlines_count[i];
  446. sel = i;
  447. }
  448. }
  449. MESSAGE_FIELD (task, nlines_type) = sel;
  450. rspamd_cryptobox_hash_init (&hs, NULL, 0);
  451. LL_FOREACH (*order_ptr, nh) {
  452. if (nh->name && nh->flags != RSPAMD_HEADER_RECEIVED) {
  453. rspamd_cryptobox_hash_update (&hs, nh->name, strlen (nh->name));
  454. }
  455. }
  456. rspamd_cryptobox_hash_final (&hs, hout);
  457. hexout = rspamd_mempool_alloc (task->task_pool, sizeof (hout) * 2 + 1);
  458. hexout[sizeof (hout) * 2] = '\0';
  459. rspamd_encode_hex_buf (hout, sizeof (hout), hexout,
  460. sizeof (hout) * 2 + 1);
  461. rspamd_mempool_set_variable (task->task_pool,
  462. RSPAMD_MEMPOOL_HEADERS_HASH,
  463. hexout, NULL);
  464. }
  465. }
  466. static void
  467. rspamd_mime_header_maybe_save_token (rspamd_mempool_t *pool,
  468. GString *out,
  469. GByteArray *token,
  470. GByteArray *decoded_token,
  471. rspamd_ftok_t *old_charset,
  472. rspamd_ftok_t *new_charset)
  473. {
  474. if (new_charset->len == 0) {
  475. g_assert_not_reached ();
  476. }
  477. if (old_charset->len > 0) {
  478. if (rspamd_ftok_casecmp (new_charset, old_charset) == 0) {
  479. rspamd_ftok_t srch;
  480. /*
  481. * Special case for iso-2022-jp:
  482. * https://github.com/vstakhov/rspamd/issues/1669
  483. */
  484. RSPAMD_FTOK_ASSIGN (&srch, "iso-2022-jp");
  485. if (rspamd_ftok_casecmp (new_charset, &srch) != 0) {
  486. /* We can concatenate buffers, just return */
  487. return;
  488. }
  489. }
  490. }
  491. /* We need to flush and decode old token to out string */
  492. if (rspamd_mime_to_utf8_byte_array (token, decoded_token, pool,
  493. rspamd_mime_detect_charset (new_charset, pool))) {
  494. g_string_append_len (out, decoded_token->data, decoded_token->len);
  495. }
  496. /* We also reset buffer */
  497. g_byte_array_set_size (token, 0);
  498. /*
  499. * Propagate charset
  500. *
  501. * Here are dragons: we save the original charset to allow buffers concat
  502. * in the condition at the beginning of the function.
  503. * However, it will likely cause unnecessary calls for
  504. * `rspamd_mime_detect_charset` which could be relatively expensive.
  505. * But we ignore that for now...
  506. */
  507. memcpy (old_charset, new_charset, sizeof (*old_charset));
  508. }
  509. static void
  510. rspamd_mime_header_sanity_check (GString *str)
  511. {
  512. gsize i;
  513. gchar t;
  514. for (i = 0; i < str->len; i ++) {
  515. t = str->str[i];
  516. if (!((t & 0x80) || g_ascii_isgraph (t))) {
  517. if (g_ascii_isspace (t)) {
  518. /* Replace spaces characters with plain space */
  519. str->str[i] = ' ';
  520. }
  521. else {
  522. str->str[i] = '?';
  523. }
  524. }
  525. }
  526. }
  527. gchar *
  528. rspamd_mime_header_decode (rspamd_mempool_t *pool, const gchar *in,
  529. gsize inlen, gboolean *invalid_utf)
  530. {
  531. GString *out;
  532. const guchar *c, *p, *end;
  533. const gchar *tok_start = NULL;
  534. gsize tok_len = 0, pos;
  535. GByteArray *token = NULL, *decoded;
  536. rspamd_ftok_t cur_charset = {0, NULL}, old_charset = {0, NULL};
  537. gint encoding;
  538. gssize r;
  539. guint qmarks = 0;
  540. gchar *ret;
  541. enum {
  542. parse_normal = 0,
  543. got_eqsign,
  544. got_encoded_start,
  545. got_more_qmark,
  546. skip_spaces,
  547. } state = parse_normal;
  548. g_assert (in != NULL);
  549. c = in;
  550. p = in;
  551. end = in + inlen;
  552. out = g_string_sized_new (inlen);
  553. token = g_byte_array_sized_new (80);
  554. decoded = g_byte_array_sized_new (122);
  555. while (p < end) {
  556. switch (state) {
  557. case parse_normal:
  558. if (*p == '=') {
  559. g_string_append_len (out, c, p - c);
  560. c = p;
  561. state = got_eqsign;
  562. }
  563. else if (*p >= 128) {
  564. gint off = 0;
  565. UChar32 uc;
  566. /* Unencoded character */
  567. g_string_append_len (out, c, p - c);
  568. /* Check if that's valid UTF8 */
  569. U8_NEXT (p, off, end - p, uc);
  570. if (uc <= 0) {
  571. c = p + 1;
  572. /* 0xFFFD in UTF8 */
  573. g_string_append_len (out, " ", 3);
  574. off = 0;
  575. U8_APPEND_UNSAFE (out->str + out->len - 3,
  576. off, 0xfffd);
  577. if (invalid_utf) {
  578. *invalid_utf = TRUE;
  579. }
  580. }
  581. else {
  582. c = p;
  583. p = p + off;
  584. continue; /* To avoid p ++ after this block */
  585. }
  586. }
  587. p ++;
  588. break;
  589. case got_eqsign:
  590. if (*p == '?') {
  591. state = got_encoded_start;
  592. qmarks = 0;
  593. }
  594. else {
  595. g_string_append_len (out, c, 1);
  596. c = p;
  597. state = parse_normal;
  598. continue; /* Deal with == case */
  599. }
  600. p ++;
  601. break;
  602. case got_encoded_start:
  603. if (*p == '?') {
  604. state = got_more_qmark;
  605. qmarks ++;
  606. /* Skip multiple ? signs */
  607. p ++;
  608. while (p < end && *p == '?') {
  609. p ++;
  610. }
  611. continue;
  612. }
  613. p ++;
  614. break;
  615. case got_more_qmark:
  616. if (*p == '=') {
  617. if (qmarks < 3) {
  618. state = got_encoded_start;
  619. }
  620. else {
  621. /* Finished encoded boundary */
  622. if (*c == '"') {
  623. /* Quoted string, non-RFC conformant but used by retards */
  624. c ++;
  625. }
  626. if (rspamd_rfc2047_parser (c, p - c + 1, &encoding,
  627. &cur_charset.begin, &cur_charset.len,
  628. &tok_start, &tok_len)) {
  629. /* We have a token, so we can decode it from `encoding` */
  630. if (token->len > 0) {
  631. if (old_charset.len == 0) {
  632. memcpy (&old_charset, &cur_charset,
  633. sizeof (old_charset));
  634. }
  635. rspamd_mime_header_maybe_save_token (pool, out,
  636. token, decoded,
  637. &old_charset, &cur_charset);
  638. }
  639. qmarks = 0;
  640. pos = token->len;
  641. g_byte_array_set_size (token, pos + tok_len);
  642. if (encoding == RSPAMD_RFC2047_QP) {
  643. r = rspamd_decode_qp2047_buf (tok_start, tok_len,
  644. token->data + pos, tok_len);
  645. if (r != -1) {
  646. token->len = pos + r;
  647. } else {
  648. /* Cannot decode qp */
  649. token->len -= tok_len;
  650. }
  651. } else {
  652. if (rspamd_cryptobox_base64_decode (tok_start, tok_len,
  653. token->data + pos, &tok_len)) {
  654. token->len = pos + tok_len;
  655. } else {
  656. /* Cannot decode */
  657. token->len -= tok_len;
  658. }
  659. }
  660. c = p + 1;
  661. state = skip_spaces;
  662. } else {
  663. /* Not encoded-word */
  664. old_charset.len = 0;
  665. if (token->len > 0) {
  666. rspamd_mime_header_maybe_save_token (pool, out,
  667. token, decoded,
  668. &old_charset, &cur_charset);
  669. }
  670. g_string_append_len (out, c, p - c);
  671. c = p;
  672. state = parse_normal;
  673. }
  674. } /* qmarks >= 3 */
  675. } /* p == '=' */
  676. else {
  677. state = got_encoded_start;
  678. }
  679. p ++;
  680. break;
  681. case skip_spaces:
  682. if (g_ascii_isspace (*p)) {
  683. p ++;
  684. }
  685. else if (*p == '=' && p < end - 1 && p[1] == '?') {
  686. /* Next boundary, can glue */
  687. c = p;
  688. p += 2;
  689. state = got_encoded_start;
  690. }
  691. else {
  692. /* Need to save spaces and decoded token */
  693. if (token->len > 0) {
  694. old_charset.len = 0;
  695. rspamd_mime_header_maybe_save_token (pool, out,
  696. token, decoded,
  697. &old_charset, &cur_charset);
  698. }
  699. g_string_append_len (out, c, p - c);
  700. c = p;
  701. state = parse_normal;
  702. }
  703. break;
  704. }
  705. }
  706. /* Leftover */
  707. switch (state) {
  708. case skip_spaces:
  709. if (token->len > 0 && cur_charset.len > 0) {
  710. old_charset.len = 0;
  711. rspamd_mime_header_maybe_save_token (pool, out,
  712. token, decoded,
  713. &old_charset, &cur_charset);
  714. }
  715. break;
  716. default:
  717. /* Just copy leftover */
  718. if (p > c) {
  719. g_string_append_len (out, c, p - c);
  720. }
  721. break;
  722. }
  723. g_byte_array_free (token, TRUE);
  724. g_byte_array_free (decoded, TRUE);
  725. rspamd_mime_header_sanity_check (out);
  726. rspamd_mempool_notify_alloc (pool, out->len);
  727. ret = g_string_free (out, FALSE);
  728. rspamd_mempool_add_destructor (pool, g_free, ret);
  729. return ret;
  730. }
  731. gchar *
  732. rspamd_mime_header_encode (const gchar *in, gsize len)
  733. {
  734. const gchar *p = in, *end = in + len;
  735. gchar *out, encode_buf[80 * sizeof (guint32)];
  736. GString *res;
  737. gboolean need_encoding = FALSE;
  738. /* Check if we need to encode */
  739. while (p < end) {
  740. if ((((guchar)*p) & 0x80) != 0) {
  741. need_encoding = TRUE;
  742. break;
  743. }
  744. p ++;
  745. }
  746. if (!need_encoding) {
  747. out = g_malloc (len + 1);
  748. rspamd_strlcpy (out, in, len + 1);
  749. }
  750. else {
  751. /* Need encode */
  752. gsize ulen, pos;
  753. gint r;
  754. const gchar *prev;
  755. /* Choose step: =?UTF-8?Q?<qp>?= should be less than 76 chars */
  756. guint step = (76 - 12) / 3 + 1;
  757. ulen = g_utf8_strlen (in, len);
  758. res = g_string_sized_new (len * 2 + 1);
  759. pos = 0;
  760. prev = in;
  761. /* Adjust chunk size for unicode average length */
  762. step *= 1.0 * ulen / (gdouble)len;
  763. while (pos < ulen) {
  764. p = g_utf8_offset_to_pointer (in, pos);
  765. if (p > prev) {
  766. /* Encode and print */
  767. r = rspamd_encode_qp2047_buf (prev, p - prev,
  768. encode_buf, sizeof (encode_buf));
  769. if (r != -1) {
  770. if (res->len > 0) {
  771. rspamd_printf_gstring (res, " =?UTF-8?Q?%*s?=", r,
  772. encode_buf);
  773. }
  774. else {
  775. rspamd_printf_gstring (res, "=?UTF-8?Q?%*s?=", r,
  776. encode_buf);
  777. }
  778. }
  779. }
  780. pos += MIN (step, ulen - pos);
  781. prev = p;
  782. }
  783. /* Leftover */
  784. if (prev < end) {
  785. r = rspamd_encode_qp2047_buf (prev, end - prev,
  786. encode_buf, sizeof (encode_buf));
  787. if (r != -1) {
  788. if (res->len > 0) {
  789. rspamd_printf_gstring (res, " =?UTF-8?Q?%*s?=", r,
  790. encode_buf);
  791. }
  792. else {
  793. rspamd_printf_gstring (res, "=?UTF-8?Q?%*s?=", r,
  794. encode_buf);
  795. }
  796. }
  797. }
  798. out = g_string_free (res, FALSE);
  799. }
  800. return out;
  801. }
  802. gchar *
  803. rspamd_mime_message_id_generate (const gchar *fqdn)
  804. {
  805. GString *out;
  806. guint64 rnd, clk;
  807. out = g_string_sized_new (strlen (fqdn) + 22);
  808. rnd = ottery_rand_uint64 ();
  809. clk = rspamd_get_calendar_ticks () * 1e6;
  810. rspamd_printf_gstring (out, "%*bs.%*bs@%s",
  811. (gint)sizeof (guint64) - 3, (guchar *)&clk,
  812. (gint)sizeof (guint64), (gchar *)&rnd,
  813. fqdn);
  814. return g_string_free (out, FALSE);
  815. }
  816. struct rspamd_mime_header *
  817. rspamd_message_get_header_from_hash (struct rspamd_mime_headers_table *hdrs,
  818. const gchar *field,
  819. gboolean need_modified)
  820. {
  821. khiter_t k;
  822. khash_t(rspamd_mime_headers_htb) *htb = &hdrs->htb;
  823. struct rspamd_mime_header *hdr;
  824. if (htb) {
  825. k = kh_get (rspamd_mime_headers_htb, htb, (gchar *) field);
  826. if (k == kh_end (htb)) {
  827. return NULL;
  828. }
  829. hdr = kh_value (htb, k);
  830. if (!need_modified) {
  831. if (hdr->flags & RSPAMD_HEADER_NON_EXISTING) {
  832. return NULL;
  833. }
  834. return hdr;
  835. }
  836. else {
  837. if (hdr->flags & RSPAMD_HEADER_MODIFIED) {
  838. return hdr->modified_chain;
  839. }
  840. return hdr;
  841. }
  842. }
  843. return NULL;
  844. }
  845. struct rspamd_mime_header *
  846. rspamd_message_get_header_array (struct rspamd_task *task, const gchar *field,
  847. gboolean need_modified)
  848. {
  849. return rspamd_message_get_header_from_hash(
  850. MESSAGE_FIELD_CHECK (task, raw_headers),
  851. field, need_modified);
  852. }
  853. gsize
  854. rspamd_mime_headers_count (struct rspamd_mime_headers_table *hdrs)
  855. {
  856. if (hdrs) {
  857. return kh_size (&hdrs->htb);
  858. }
  859. return 0;
  860. }
  861. bool
  862. rspamd_mime_headers_foreach(const struct rspamd_mime_headers_table *hdrs,
  863. rspamd_hdr_traverse_func_t func, void *ud)
  864. {
  865. const gchar *name;
  866. struct rspamd_mime_header *hdr;
  867. kh_foreach(&hdrs->htb, name, hdr, {
  868. if (!func(name, hdr, ud)) {
  869. return false;
  870. }
  871. });
  872. return true;
  873. }
  874. static void
  875. rspamd_message_headers_dtor (struct rspamd_mime_headers_table *hdrs)
  876. {
  877. if (hdrs) {
  878. kfree (hdrs->htb.keys);
  879. kfree (hdrs->htb.vals);
  880. kfree (hdrs->htb.flags);
  881. g_free (hdrs);
  882. }
  883. }
  884. struct rspamd_mime_headers_table *
  885. rspamd_message_headers_ref (struct rspamd_mime_headers_table *hdrs)
  886. {
  887. REF_RETAIN (hdrs);
  888. return hdrs;
  889. }
  890. void
  891. rspamd_message_headers_unref (struct rspamd_mime_headers_table *hdrs)
  892. {
  893. REF_RELEASE (hdrs);
  894. }
  895. struct rspamd_mime_headers_table *
  896. rspamd_message_headers_new (void)
  897. {
  898. struct rspamd_mime_headers_table *nhdrs;
  899. nhdrs = g_malloc0 (sizeof (*nhdrs));
  900. REF_INIT_RETAIN (nhdrs, rspamd_message_headers_dtor);
  901. return nhdrs;
  902. }
  903. void
  904. rspamd_message_set_modified_header (struct rspamd_task *task,
  905. struct rspamd_mime_headers_table *hdrs,
  906. const gchar *hdr_name,
  907. const ucl_object_t *obj)
  908. {
  909. khiter_t k;
  910. khash_t(rspamd_mime_headers_htb) *htb = &hdrs->htb;
  911. struct rspamd_mime_header *hdr_elt, *existing_chain;
  912. int i;
  913. if (htb) {
  914. k = kh_get (rspamd_mime_headers_htb, htb, (gchar *)hdr_name);
  915. if (k == kh_end (htb)) {
  916. hdr_elt = rspamd_mempool_alloc0 (task->task_pool, sizeof (*hdr_elt));
  917. hdr_elt->flags |= RSPAMD_HEADER_MODIFIED|RSPAMD_HEADER_NON_EXISTING;
  918. hdr_elt->name = rspamd_mempool_strdup (task->task_pool, hdr_name);
  919. int r;
  920. k = kh_put (rspamd_mime_headers_htb, htb, hdr_elt->name, &r);
  921. kh_value (htb, k) = hdr_elt;
  922. }
  923. else {
  924. hdr_elt = kh_value (htb, k);
  925. }
  926. }
  927. else {
  928. /* No hash, no modification */
  929. msg_err_task ("internal error: calling for set_modified_header for no headers");
  930. return;
  931. }
  932. if (hdr_elt->flags & RSPAMD_HEADER_MODIFIED) {
  933. existing_chain = hdr_elt->modified_chain;
  934. }
  935. else {
  936. existing_chain = hdr_elt;
  937. }
  938. const ucl_object_t *elt, *cur;
  939. ucl_object_iter_t it;
  940. /* First, deal with removed headers, copying the relevant headers with remove flag */
  941. elt = ucl_object_lookup (obj, "remove");
  942. /*
  943. * remove: {1, 2 ...}
  944. * where number is the header's position starting from '1'
  945. */
  946. if (elt && ucl_object_type (elt) == UCL_ARRAY) {
  947. /* First, use a temporary array to keep all headers */
  948. GPtrArray *existing_ar = g_ptr_array_new ();
  949. struct rspamd_mime_header *cur_hdr;
  950. /* Exclude removed headers */
  951. LL_FOREACH (existing_chain, cur_hdr) {
  952. if (!(cur_hdr->flags & RSPAMD_HEADER_REMOVED)) {
  953. g_ptr_array_add (existing_ar, cur_hdr);
  954. }
  955. }
  956. it = NULL;
  957. while ((cur = ucl_object_iterate (elt, &it, true)) != NULL) {
  958. if (ucl_object_type (cur) == UCL_INT) {
  959. int ord = ucl_object_toint (cur);
  960. if (ord == 0) {
  961. /* Remove all headers in the existing chain */
  962. PTR_ARRAY_FOREACH (existing_ar, i, cur_hdr) {
  963. cur_hdr->flags |= RSPAMD_HEADER_MODIFIED|RSPAMD_HEADER_REMOVED;
  964. }
  965. }
  966. else if (ord > 0) {
  967. /* Start from the top */
  968. if (ord <= existing_ar->len) {
  969. cur_hdr = g_ptr_array_index (existing_ar, ord - 1);
  970. cur_hdr->flags |= RSPAMD_HEADER_MODIFIED|RSPAMD_HEADER_REMOVED;
  971. }
  972. }
  973. else {
  974. /* Start from the bottom; ord < 0 */
  975. if ((-ord) <= existing_ar->len) {
  976. cur_hdr = g_ptr_array_index (existing_ar, existing_ar->len + ord);
  977. cur_hdr->flags |= RSPAMD_HEADER_MODIFIED|RSPAMD_HEADER_REMOVED;
  978. }
  979. }
  980. }
  981. }
  982. /*
  983. * Next, we return all headers modified to the existing chain
  984. * This implies an additional copy of all structures but is safe enough to
  985. * deal with it
  986. */
  987. hdr_elt->flags |= RSPAMD_HEADER_MODIFIED;
  988. hdr_elt->modified_chain = NULL;
  989. gint new_chain_length = 0;
  990. PTR_ARRAY_FOREACH (existing_ar, i, cur_hdr) {
  991. if (!(cur_hdr->flags & RSPAMD_HEADER_REMOVED)) {
  992. struct rspamd_mime_header *nhdr = rspamd_mempool_alloc (
  993. task->task_pool, sizeof (*nhdr));
  994. memcpy (nhdr, cur_hdr, sizeof (*nhdr));
  995. nhdr->modified_chain = NULL;
  996. nhdr->prev = NULL;
  997. nhdr->next = NULL;
  998. nhdr->ord_next = NULL;
  999. DL_APPEND (hdr_elt->modified_chain, nhdr);
  1000. new_chain_length ++;
  1001. }
  1002. }
  1003. g_ptr_array_free (existing_ar, TRUE);
  1004. /* End of headers removal logic */
  1005. }
  1006. /* We can now deal with headers additions */
  1007. elt = ucl_object_lookup (obj, "add");
  1008. if (elt && ucl_object_type (elt) == UCL_ARRAY) {
  1009. if (!(hdr_elt->flags & RSPAMD_HEADER_MODIFIED)) {
  1010. /* Copy the header itself to the modified chain */
  1011. struct rspamd_mime_header *nhdr;
  1012. hdr_elt->flags |= RSPAMD_HEADER_MODIFIED;
  1013. nhdr = rspamd_mempool_alloc (
  1014. task->task_pool, sizeof (*nhdr));
  1015. memcpy (nhdr, hdr_elt, sizeof (*hdr_elt));
  1016. nhdr->modified_chain = NULL;
  1017. nhdr->next = NULL;
  1018. nhdr->ord_next = NULL;
  1019. nhdr->prev = nhdr;
  1020. hdr_elt->modified_chain = nhdr;
  1021. }
  1022. /*
  1023. * add: {{1, "foo"}, {-1, "bar"} ...}
  1024. * where number is the header's position starting from '1'
  1025. */
  1026. it = NULL;
  1027. while ((cur = ucl_object_iterate (elt, &it, true)) != NULL) {
  1028. if (ucl_object_type (cur) == UCL_ARRAY) {
  1029. const ucl_object_t *order = ucl_array_find_index (cur, 0),
  1030. *value = ucl_array_find_index (cur, 1);
  1031. if (order && value &&
  1032. (ucl_object_type (order) == UCL_INT &&
  1033. ucl_object_type (value) == UCL_STRING)) {
  1034. int ord = ucl_object_toint (order);
  1035. const char *raw_value;
  1036. gsize raw_len;
  1037. raw_value = ucl_object_tolstring (value, &raw_len);
  1038. if (raw_len == 0) {
  1039. continue;
  1040. }
  1041. struct rspamd_mime_header *nhdr = rspamd_mempool_alloc0 (
  1042. task->task_pool, sizeof (*nhdr));
  1043. nhdr->flags |= RSPAMD_HEADER_ADDED;
  1044. nhdr->name = hdr_elt->name;
  1045. nhdr->value = rspamd_mempool_alloc (task->task_pool,
  1046. raw_len + 1);
  1047. nhdr->raw_len = rspamd_strlcpy (nhdr->value, raw_value,
  1048. raw_len + 1);
  1049. nhdr->raw_value = nhdr->value;
  1050. nhdr->decoded = rspamd_mime_header_decode (task->task_pool,
  1051. raw_value, raw_len, NULL);
  1052. /* Now find a position to insert a value */
  1053. struct rspamd_mime_header **pos = &hdr_elt->modified_chain;
  1054. if (ord == 0) {
  1055. DL_PREPEND (hdr_elt->modified_chain, nhdr);
  1056. }
  1057. else if (ord == -1) {
  1058. DL_APPEND (hdr_elt->modified_chain, nhdr);
  1059. }
  1060. else if (ord > 0) {
  1061. while (ord > 0 && (*pos)) {
  1062. ord --;
  1063. pos = &((*pos)->next);
  1064. }
  1065. if (*pos) {
  1066. /* pos is &(elt)->next */
  1067. nhdr->next = (*pos);
  1068. nhdr->prev = (*pos)->prev;
  1069. (*pos)->prev = nhdr;
  1070. *pos = nhdr;
  1071. }
  1072. else {
  1073. /* Last element */
  1074. DL_APPEND (*pos, nhdr);
  1075. }
  1076. }
  1077. else {
  1078. /* NYI: negative order is not defined */
  1079. msg_err_task ("internal error: calling for set_modified_header "
  1080. "with negative add order header");
  1081. }
  1082. }
  1083. else {
  1084. msg_err_task ("internal error: calling for set_modified_header "
  1085. "with invalid header");
  1086. }
  1087. }
  1088. }
  1089. }
  1090. }
  /**
   * Removes parenthesised comments from `input` in place.
   *
   * Text outside of comments is compacted to the beginning of the buffer.
   * Comments may be nested; a comment ends when its opening parenthesis is
   * balanced. A backslash escapes the following character: outside of a comment
   * the backslash is dropped and the next character is copied verbatim, inside
   * a comment both are discarded (so an escaped parenthesis does not affect
   * nesting). An unterminated comment swallows the rest of the input.
   * The result is not NUL terminated by this function.
   *
   * @param input buffer to process, modified in place
   * @param len number of bytes to process
   * @return number of bytes kept at the beginning of `input`
   */
  gsize
  rspamd_strip_smtp_comments_inplace (gchar *input, gsize len)
  {
      enum parser_state {
          parse_normal,
          parse_comment,
          parse_quoted_copy,
          parse_quoted_ignore,
      } state = parse_normal;
      gchar *d = input, *end = input + len, *start = input;
      /* Number of currently open, not yet closed parentheses */
      gsize depth = 0;

      while (input < end) {
          gchar t = *input++;

          switch (state) {
          case parse_normal:
              if (t == '(') {
                  depth = 1;
                  state = parse_comment;
              }
              else if (t == '\\') {
                  state = parse_quoted_copy;
              }
              else {
                  *d++ = t;
              }
              break;
          case parse_comment:
              if (t == '(') {
                  depth ++;
              }
              else if (t == ')') {
                  /* depth is at least 1 while in this state */
                  if (--depth == 0) {
                      state = parse_normal;
                  }
              }
              else if (t == '\\') {
                  state = parse_quoted_ignore;
              }
              break;
          case parse_quoted_copy:
              /* Escaped character outside of a comment: keep it as is */
              *d++ = t;
              state = parse_normal;
              break;
          case parse_quoted_ignore:
              /* Escaped character inside a comment: drop it */
              state = parse_comment;
              break;
          }
      }

      return (gsize)(d - start);
  }