You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

email_addr.c 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "email_addr.h"
  18. #include "message.h"
  19. #include "printf.h"
  20. #include "smtp_parsers.h"
  21. static void
  22. rspamd_email_address_unescape(struct rspamd_email_address *addr)
  23. {
  24. const char *h, *end;
  25. char *t, *d;
  26. if (addr->user_len == 0) {
  27. return;
  28. }
  29. d = g_malloc(addr->user_len);
  30. t = d;
  31. h = addr->user;
  32. end = h + addr->user_len;
  33. while (h < end) {
  34. if (*h != '\\') {
  35. *t++ = *h;
  36. }
  37. h++;
  38. }
  39. addr->user = d;
  40. addr->user_len = t - d;
  41. addr->flags |= RSPAMD_EMAIL_ADDR_USER_ALLOCATED;
  42. }
  43. struct rspamd_email_address *
  44. rspamd_email_address_from_smtp(const char *str, unsigned int len)
  45. {
  46. struct rspamd_email_address addr, *ret;
  47. gsize nlen;
  48. if (str == NULL || len == 0) {
  49. return NULL;
  50. }
  51. rspamd_smtp_addr_parse(str, len, &addr);
  52. if (addr.flags & RSPAMD_EMAIL_ADDR_VALID) {
  53. ret = g_malloc(sizeof(*ret));
  54. memcpy(ret, &addr, sizeof(addr));
  55. if ((ret->flags & RSPAMD_EMAIL_ADDR_QUOTED) && ret->addr[0] == '"') {
  56. if (ret->flags & RSPAMD_EMAIL_ADDR_HAS_BACKSLASH) {
  57. /* We also need to unquote user */
  58. rspamd_email_address_unescape(ret);
  59. }
  60. /* We need to unquote addr */
  61. nlen = ret->domain_len + ret->user_len + 2;
  62. ret->addr = g_malloc(nlen + 1);
  63. ret->addr_len = rspamd_snprintf((char *) ret->addr, nlen, "%*s@%*s",
  64. (int) ret->user_len, ret->user,
  65. (int) ret->domain_len, ret->domain);
  66. ret->flags |= RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED;
  67. }
  68. return ret;
  69. }
  70. return NULL;
  71. }
  72. void rspamd_email_address_free(struct rspamd_email_address *addr)
  73. {
  74. if (addr) {
  75. if (addr->flags & RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED) {
  76. g_free((void *) addr->addr);
  77. }
  78. if (addr->flags & RSPAMD_EMAIL_ADDR_USER_ALLOCATED) {
  79. g_free((void *) addr->user);
  80. }
  81. g_free(addr);
  82. }
  83. }
  84. static inline void
  85. rspamd_email_address_add(rspamd_mempool_t *pool,
  86. GPtrArray *ar,
  87. struct rspamd_email_address *addr,
  88. GString *name)
  89. {
  90. struct rspamd_email_address *elt;
  91. unsigned int nlen;
  92. elt = g_malloc0(sizeof(*elt));
  93. rspamd_mempool_notify_alloc(pool, sizeof(*elt));
  94. if (addr != NULL) {
  95. memcpy(elt, addr, sizeof(*addr));
  96. }
  97. else {
  98. elt->addr = "";
  99. elt->domain = "";
  100. elt->raw = "<>";
  101. elt->raw_len = 2;
  102. elt->user = "";
  103. elt->flags |= RSPAMD_EMAIL_ADDR_EMPTY;
  104. }
  105. if ((elt->flags & RSPAMD_EMAIL_ADDR_QUOTED) && elt->addr[0] == '"') {
  106. if (elt->flags & RSPAMD_EMAIL_ADDR_HAS_BACKSLASH) {
  107. /* We also need to unquote user */
  108. rspamd_email_address_unescape(elt);
  109. }
  110. /* We need to unquote addr */
  111. nlen = elt->domain_len + elt->user_len + 2;
  112. elt->addr = g_malloc(nlen + 1);
  113. rspamd_mempool_notify_alloc(pool, nlen + 1);
  114. elt->addr_len = rspamd_snprintf((char *) elt->addr, nlen, "%*s@%*s",
  115. (int) elt->user_len, elt->user,
  116. (int) elt->domain_len, elt->domain);
  117. elt->flags |= RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED;
  118. }
  119. if (name->len > 0) {
  120. rspamd_gstring_strip(name, " \t\v");
  121. elt->name = rspamd_mime_header_decode(pool, name->str, name->len, NULL);
  122. }
  123. rspamd_mempool_notify_alloc(pool, name->len);
  124. g_ptr_array_add(ar, elt);
  125. }
  126. /*
  127. * Tries to parse an email address that doesn't conform RFC
  128. */
  129. static gboolean
  130. rspamd_email_address_parse_heuristic(const char *data, size_t len,
  131. struct rspamd_email_address *addr)
  132. {
  133. const char *p = data, *at = NULL, *end = data + len;
  134. gboolean ret = FALSE;
  135. memset(addr, 0, sizeof(*addr));
  136. if (*p == '<' && len > 1) {
  137. /* Angled address */
  138. addr->addr_len = rspamd_memcspn(p + 1, ">", len - 1);
  139. addr->addr = p + 1;
  140. addr->raw = p;
  141. addr->raw_len = len;
  142. ret = TRUE;
  143. p = p + 1;
  144. len = addr->addr_len;
  145. end = p + len;
  146. }
  147. else if (len > 0) {
  148. addr->addr = p;
  149. addr->addr_len = len;
  150. addr->raw = p;
  151. addr->raw_len = len;
  152. ret = TRUE;
  153. }
  154. if (ret) {
  155. at = rspamd_memrchr(p, '@', len);
  156. if (at != NULL && at + 1 < end) {
  157. addr->domain = at + 1;
  158. addr->domain_len = end - (at + 1);
  159. addr->user = p;
  160. addr->user_len = at - p;
  161. }
  162. if (rspamd_str_has_8bit(p, len)) {
  163. addr->flags |= RSPAMD_EMAIL_ADDR_HAS_8BIT;
  164. }
  165. }
  166. return ret;
  167. }
  168. static inline int
  169. rspamd_email_address_check_and_add(const char *start, gsize len,
  170. GPtrArray *res,
  171. rspamd_mempool_t *pool,
  172. GString *ns,
  173. int max_elements)
  174. {
  175. struct rspamd_email_address addr;
  176. g_assert(res != NULL);
  177. if (max_elements > 0 && res->len >= max_elements) {
  178. msg_info_pool_check("reached maximum number of elements %d when adding %v",
  179. max_elements,
  180. ns);
  181. return -1;
  182. }
  183. /* The whole email is likely address */
  184. memset(&addr, 0, sizeof(addr));
  185. rspamd_smtp_addr_parse(start, len, &addr);
  186. if (addr.flags & RSPAMD_EMAIL_ADDR_VALID) {
  187. rspamd_email_address_add(pool, res, &addr, ns);
  188. }
  189. else {
  190. /* Try heuristic */
  191. if (rspamd_email_address_parse_heuristic(start,
  192. len, &addr)) {
  193. rspamd_email_address_add(pool, res, &addr, ns);
  194. return 1;
  195. }
  196. else {
  197. return 0;
  198. }
  199. }
  200. return 1;
  201. }
  202. GPtrArray *
  203. rspamd_email_address_from_mime(rspamd_mempool_t *pool, const char *hdr,
  204. unsigned int len,
  205. GPtrArray *src,
  206. int max_elements)
  207. {
  208. GPtrArray *res = src;
  209. gboolean seen_at = FALSE, seen_obrace = FALSE;
  210. const char *p = hdr, *end = hdr + len, *c = hdr, *t;
  211. GString *ns, *cpy;
  212. int obraces, ebraces;
  213. enum {
  214. parse_name = 0,
  215. parse_quoted,
  216. parse_addr,
  217. skip_spaces
  218. } state = parse_name,
  219. next_state = parse_name;
  220. if (res == NULL) {
  221. res = g_ptr_array_sized_new(2);
  222. rspamd_mempool_add_destructor(pool, rspamd_email_address_list_destroy,
  223. res);
  224. }
  225. else if (max_elements > 0 && res->len >= max_elements) {
  226. msg_info_pool_check("reached maximum number of elements %d", max_elements);
  227. return res;
  228. }
  229. ns = g_string_sized_new(len);
  230. cpy = g_string_sized_new(len);
  231. rspamd_mempool_add_destructor(pool, rspamd_gstring_free_hard, cpy);
  232. /* First, we need to remove all comments as they are terrible */
  233. obraces = 0;
  234. ebraces = 0;
  235. while (p < end) {
  236. if (state == parse_name) {
  237. if (*p == '\\') {
  238. if (obraces == 0) {
  239. g_string_append_c(cpy, *p);
  240. }
  241. p++;
  242. }
  243. else {
  244. if (*p == '"') {
  245. state = parse_quoted;
  246. }
  247. else if (*p == '(') {
  248. obraces++; /* To avoid ) itself being copied */
  249. }
  250. else if (*p == ')') {
  251. ebraces++;
  252. p++;
  253. }
  254. if (obraces == ebraces) {
  255. obraces = 0;
  256. ebraces = 0;
  257. }
  258. }
  259. if (p < end && obraces == 0) {
  260. g_string_append_c(cpy, *p);
  261. }
  262. }
  263. else {
  264. /* Quoted elt */
  265. if (*p == '\\') {
  266. g_string_append_c(cpy, *p);
  267. p++;
  268. }
  269. else {
  270. if (*p == '"') {
  271. state = parse_name;
  272. }
  273. }
  274. if (p < end) {
  275. g_string_append_c(cpy, *p);
  276. }
  277. }
  278. p++;
  279. }
  280. state = parse_name;
  281. p = cpy->str;
  282. c = p;
  283. end = p + cpy->len;
  284. while (p < end) {
  285. switch (state) {
  286. case parse_name:
  287. if (*p == '"') {
  288. /* We need to strip last spaces and update `ns` */
  289. if (p > c) {
  290. unsigned int nspaces = 0;
  291. t = p - 1;
  292. while (t > c && g_ascii_isspace(*t)) {
  293. t--;
  294. nspaces++;
  295. }
  296. g_string_append_len(ns, c, t - c + 1);
  297. if (nspaces > 0) {
  298. g_string_append_c(ns, ' ');
  299. }
  300. }
  301. state = parse_quoted;
  302. c = p + 1;
  303. }
  304. else if (*p == '<') {
  305. if (p > c) {
  306. t = p - 1;
  307. while (t > c && g_ascii_isspace(*t)) {
  308. t--;
  309. }
  310. g_string_append_len(ns, c, t - c + 1);
  311. }
  312. c = p;
  313. state = parse_addr;
  314. }
  315. else if (*p == ',') {
  316. if (p > c && seen_at) {
  317. /*
  318. * Last token must be the address:
  319. * e.g. Some name name@domain.com
  320. */
  321. t = p - 1;
  322. while (t > c && g_ascii_isspace(*t)) {
  323. t--;
  324. }
  325. int check = rspamd_email_address_check_and_add(c, t - c + 1,
  326. res, pool, ns, max_elements);
  327. if (check == 0 && res->len == 0) {
  328. /* Insert fake address */
  329. rspamd_email_address_add(pool, res, NULL, ns);
  330. }
  331. else if (check != 1) {
  332. goto end;
  333. }
  334. /* Cleanup for the next use */
  335. g_string_set_size(ns, 0);
  336. seen_at = FALSE;
  337. }
  338. state = skip_spaces;
  339. next_state = parse_name;
  340. }
  341. else if (*p == '@') {
  342. seen_at = TRUE;
  343. }
  344. p++;
  345. break;
  346. case parse_quoted:
  347. if (*p == '\\') {
  348. if (p > c) {
  349. g_string_append_len(ns, c, p - c);
  350. }
  351. p++;
  352. c = p;
  353. }
  354. else if (*p == '"') {
  355. if (p > c) {
  356. g_string_append_len(ns, c, p - c);
  357. }
  358. if (p + 1 < end && g_ascii_isspace(p[1])) {
  359. g_string_append_c(ns, ' ');
  360. }
  361. state = skip_spaces;
  362. next_state = parse_name;
  363. }
  364. else if (*p == '@' && seen_obrace) {
  365. seen_at = TRUE;
  366. }
  367. else if (*p == '<') {
  368. seen_obrace = TRUE;
  369. }
  370. p++;
  371. break;
  372. case parse_addr:
  373. if (*p == '>') {
  374. int check = rspamd_email_address_check_and_add(c, p - c + 1,
  375. res, pool, ns, max_elements);
  376. if (check == 0 && res->len == 0) {
  377. /* Insert a fake address */
  378. rspamd_email_address_add(pool, res, NULL, ns);
  379. }
  380. else if (check != 1) {
  381. goto end;
  382. }
  383. /* Cleanup for the next use */
  384. g_string_set_size(ns, 0);
  385. seen_at = FALSE;
  386. state = skip_spaces;
  387. next_state = parse_name;
  388. }
  389. else if (*p == '@') {
  390. seen_at = TRUE;
  391. }
  392. p++;
  393. break;
  394. case skip_spaces:
  395. if (!g_ascii_isspace(*p)) {
  396. c = p;
  397. state = next_state;
  398. }
  399. else {
  400. p++;
  401. }
  402. break;
  403. }
  404. }
  405. /* Handle leftover */
  406. switch (state) {
  407. case parse_name:
  408. /* Assume the whole header as name (bad thing) */
  409. if (p > c) {
  410. while (p > c && g_ascii_isspace(*p)) {
  411. p--;
  412. }
  413. if (p > c) {
  414. if (seen_at) {
  415. /* The whole email is likely address */
  416. int check = rspamd_email_address_check_and_add(c, p - c,
  417. res, pool, ns, max_elements);
  418. if (check == 0 && res->len == 0) {
  419. /* Insert a fake address */
  420. rspamd_email_address_add(pool, res, NULL, ns);
  421. }
  422. else if (check != 1) {
  423. goto end;
  424. }
  425. }
  426. else {
  427. /* No @ seen */
  428. g_string_append_len(ns, c, p - c);
  429. if (res->len == 0) {
  430. rspamd_email_address_add(pool, res, NULL, ns);
  431. }
  432. }
  433. }
  434. else if (res->len == 0) {
  435. rspamd_email_address_add(pool, res, NULL, ns);
  436. }
  437. }
  438. break;
  439. case parse_addr:
  440. if (p > c) {
  441. if (rspamd_email_address_check_and_add(c, p - c,
  442. res, pool, ns, max_elements) == 0) {
  443. if (res->len == 0) {
  444. rspamd_email_address_add(pool, res, NULL, ns);
  445. }
  446. }
  447. }
  448. break;
  449. case parse_quoted:
  450. /* Unfinished quoted string or a comment */
  451. /* If we have seen obrace + at, then we still can try to resolve address */
  452. if (seen_at && seen_obrace) {
  453. p = rspamd_memrchr(cpy->str, '<', cpy->len);
  454. g_assert(p != NULL);
  455. if (rspamd_email_address_check_and_add(p, end - p,
  456. res, pool, ns, max_elements) == 0) {
  457. if (res->len == 0) {
  458. rspamd_email_address_add(pool, res, NULL, ns);
  459. }
  460. }
  461. }
  462. break;
  463. default:
  464. /* Do nothing */
  465. break;
  466. }
  467. end:
  468. rspamd_mempool_notify_alloc(pool, cpy->len);
  469. g_string_free(ns, TRUE);
  470. return res;
  471. }
  472. void rspamd_email_address_list_destroy(gpointer ptr)
  473. {
  474. GPtrArray *ar = ptr;
  475. unsigned int i;
  476. struct rspamd_email_address *addr;
  477. PTR_ARRAY_FOREACH(ar, i, addr)
  478. {
  479. rspamd_email_address_free(addr);
  480. }
  481. g_ptr_array_free(ar, TRUE);
  482. }