123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563 |
- /*-
- * Copyright 2016 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- #include "config.h"
- #include "email_addr.h"
- #include "message.h"
- #include "printf.h"
- #include "smtp_parsers.h"
-
- static void
- rspamd_email_address_unescape(struct rspamd_email_address *addr)
- {
- const char *h, *end;
- char *t, *d;
-
- if (addr->user_len == 0) {
- return;
- }
-
- d = g_malloc(addr->user_len);
- t = d;
- h = addr->user;
- end = h + addr->user_len;
-
- while (h < end) {
- if (*h != '\\') {
- *t++ = *h;
- }
- h++;
- }
-
- addr->user = d;
- addr->user_len = t - d;
- addr->flags |= RSPAMD_EMAIL_ADDR_USER_ALLOCATED;
- }
-
- struct rspamd_email_address *
- rspamd_email_address_from_smtp(const char *str, unsigned int len)
- {
- struct rspamd_email_address addr, *ret;
- gsize nlen;
-
- if (str == NULL || len == 0) {
- return NULL;
- }
-
- rspamd_smtp_addr_parse(str, len, &addr);
-
- if (addr.flags & RSPAMD_EMAIL_ADDR_VALID) {
- ret = g_malloc(sizeof(*ret));
- memcpy(ret, &addr, sizeof(addr));
-
- if ((ret->flags & RSPAMD_EMAIL_ADDR_QUOTED) && ret->addr[0] == '"') {
- if (ret->flags & RSPAMD_EMAIL_ADDR_HAS_BACKSLASH) {
- /* We also need to unquote user */
- rspamd_email_address_unescape(ret);
- }
-
- /* We need to unquote addr */
- nlen = ret->domain_len + ret->user_len + 2;
- ret->addr = g_malloc(nlen + 1);
- ret->addr_len = rspamd_snprintf((char *) ret->addr, nlen, "%*s@%*s",
- (int) ret->user_len, ret->user,
- (int) ret->domain_len, ret->domain);
- ret->flags |= RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED;
- }
-
- return ret;
- }
-
- return NULL;
- }
-
- void rspamd_email_address_free(struct rspamd_email_address *addr)
- {
- if (addr) {
- if (addr->flags & RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED) {
- g_free((void *) addr->addr);
- }
-
- if (addr->flags & RSPAMD_EMAIL_ADDR_USER_ALLOCATED) {
- g_free((void *) addr->user);
- }
-
- g_free(addr);
- }
- }
-
- static inline void
- rspamd_email_address_add(rspamd_mempool_t *pool,
- GPtrArray *ar,
- struct rspamd_email_address *addr,
- GString *name)
- {
- struct rspamd_email_address *elt;
- unsigned int nlen;
-
- elt = g_malloc0(sizeof(*elt));
- rspamd_mempool_notify_alloc(pool, sizeof(*elt));
-
- if (addr != NULL) {
- memcpy(elt, addr, sizeof(*addr));
- }
- else {
- elt->addr = "";
- elt->domain = "";
- elt->raw = "<>";
- elt->raw_len = 2;
- elt->user = "";
- elt->flags |= RSPAMD_EMAIL_ADDR_EMPTY;
- }
-
- if ((elt->flags & RSPAMD_EMAIL_ADDR_QUOTED) && elt->addr[0] == '"') {
- if (elt->flags & RSPAMD_EMAIL_ADDR_HAS_BACKSLASH) {
- /* We also need to unquote user */
- rspamd_email_address_unescape(elt);
- }
-
- /* We need to unquote addr */
- nlen = elt->domain_len + elt->user_len + 2;
- elt->addr = g_malloc(nlen + 1);
- rspamd_mempool_notify_alloc(pool, nlen + 1);
- elt->addr_len = rspamd_snprintf((char *) elt->addr, nlen, "%*s@%*s",
- (int) elt->user_len, elt->user,
- (int) elt->domain_len, elt->domain);
- elt->flags |= RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED;
- }
-
- if (name->len > 0) {
- rspamd_gstring_strip(name, " \t\v");
- elt->name = rspamd_mime_header_decode(pool, name->str, name->len, NULL);
- }
-
- rspamd_mempool_notify_alloc(pool, name->len);
- g_ptr_array_add(ar, elt);
- }
-
- /*
- * Tries to parse an email address that doesn't conform RFC
- */
- static gboolean
- rspamd_email_address_parse_heuristic(const char *data, size_t len,
- struct rspamd_email_address *addr)
- {
- const char *p = data, *at = NULL, *end = data + len;
- gboolean ret = FALSE;
-
- memset(addr, 0, sizeof(*addr));
-
- if (*p == '<' && len > 1) {
- /* Angled address */
- addr->addr_len = rspamd_memcspn(p + 1, ">", len - 1);
- addr->addr = p + 1;
- addr->raw = p;
- addr->raw_len = len;
- ret = TRUE;
-
- p = p + 1;
- len = addr->addr_len;
- end = p + len;
- }
- else if (len > 0) {
- addr->addr = p;
- addr->addr_len = len;
- addr->raw = p;
- addr->raw_len = len;
- ret = TRUE;
- }
-
- if (ret) {
- at = rspamd_memrchr(p, '@', len);
-
- if (at != NULL && at + 1 < end) {
- addr->domain = at + 1;
- addr->domain_len = end - (at + 1);
- addr->user = p;
- addr->user_len = at - p;
- }
-
- if (rspamd_str_has_8bit(p, len)) {
- addr->flags |= RSPAMD_EMAIL_ADDR_HAS_8BIT;
- }
- }
-
- return ret;
- }
-
- static inline int
- rspamd_email_address_check_and_add(const char *start, gsize len,
- GPtrArray *res,
- rspamd_mempool_t *pool,
- GString *ns,
- int max_elements)
- {
- struct rspamd_email_address addr;
-
- g_assert(res != NULL);
-
- if (max_elements > 0 && res->len >= max_elements) {
- msg_info_pool_check("reached maximum number of elements %d when adding %v",
- max_elements,
- ns);
-
- return -1;
- }
-
- /* The whole email is likely address */
- memset(&addr, 0, sizeof(addr));
- rspamd_smtp_addr_parse(start, len, &addr);
-
- if (addr.flags & RSPAMD_EMAIL_ADDR_VALID) {
- rspamd_email_address_add(pool, res, &addr, ns);
- }
- else {
- /* Try heuristic */
- if (rspamd_email_address_parse_heuristic(start,
- len, &addr)) {
- rspamd_email_address_add(pool, res, &addr, ns);
-
- return 1;
- }
- else {
- return 0;
- }
- }
-
- return 1;
- }
-
- GPtrArray *
- rspamd_email_address_from_mime(rspamd_mempool_t *pool, const char *hdr,
- unsigned int len,
- GPtrArray *src,
- int max_elements)
- {
- GPtrArray *res = src;
- gboolean seen_at = FALSE, seen_obrace = FALSE;
-
- const char *p = hdr, *end = hdr + len, *c = hdr, *t;
- GString *ns, *cpy;
- int obraces, ebraces;
- enum {
- parse_name = 0,
- parse_quoted,
- parse_addr,
- skip_spaces
- } state = parse_name,
- next_state = parse_name;
-
- if (res == NULL) {
- res = g_ptr_array_sized_new(2);
- rspamd_mempool_add_destructor(pool, rspamd_email_address_list_destroy,
- res);
- }
- else if (max_elements > 0 && res->len >= max_elements) {
- msg_info_pool_check("reached maximum number of elements %d", max_elements);
-
- return res;
- }
-
- ns = g_string_sized_new(len);
- cpy = g_string_sized_new(len);
-
- rspamd_mempool_add_destructor(pool, rspamd_gstring_free_hard, cpy);
-
- /* First, we need to remove all comments as they are terrible */
- obraces = 0;
- ebraces = 0;
-
- while (p < end) {
- if (state == parse_name) {
- if (*p == '\\') {
- if (obraces == 0) {
- g_string_append_c(cpy, *p);
- }
-
- p++;
- }
- else {
- if (*p == '"') {
- state = parse_quoted;
- }
- else if (*p == '(') {
- obraces++; /* To avoid ) itself being copied */
- }
- else if (*p == ')') {
- ebraces++;
- p++;
- }
-
- if (obraces == ebraces) {
- obraces = 0;
- ebraces = 0;
- }
- }
-
- if (p < end && obraces == 0) {
- g_string_append_c(cpy, *p);
- }
- }
- else {
- /* Quoted elt */
- if (*p == '\\') {
- g_string_append_c(cpy, *p);
- p++;
- }
- else {
- if (*p == '"') {
- state = parse_name;
- }
- }
-
- if (p < end) {
- g_string_append_c(cpy, *p);
- }
- }
-
- p++;
- }
-
- state = parse_name;
-
- p = cpy->str;
- c = p;
- end = p + cpy->len;
-
- while (p < end) {
- switch (state) {
- case parse_name:
- if (*p == '"') {
- /* We need to strip last spaces and update `ns` */
- if (p > c) {
- unsigned int nspaces = 0;
-
- t = p - 1;
-
- while (t > c && g_ascii_isspace(*t)) {
- t--;
- nspaces++;
- }
-
- g_string_append_len(ns, c, t - c + 1);
-
- if (nspaces > 0) {
- g_string_append_c(ns, ' ');
- }
- }
-
- state = parse_quoted;
- c = p + 1;
- }
- else if (*p == '<') {
- if (p > c) {
- t = p - 1;
-
- while (t > c && g_ascii_isspace(*t)) {
- t--;
- }
-
- g_string_append_len(ns, c, t - c + 1);
- }
-
- c = p;
- state = parse_addr;
- }
- else if (*p == ',') {
- if (p > c && seen_at) {
- /*
- * Last token must be the address:
- * e.g. Some name name@domain.com
- */
- t = p - 1;
-
- while (t > c && g_ascii_isspace(*t)) {
- t--;
- }
-
- int check = rspamd_email_address_check_and_add(c, t - c + 1,
- res, pool, ns, max_elements);
-
- if (check == 0 && res->len == 0) {
- /* Insert fake address */
- rspamd_email_address_add(pool, res, NULL, ns);
- }
- else if (check != 1) {
- goto end;
- }
-
- /* Cleanup for the next use */
- g_string_set_size(ns, 0);
- seen_at = FALSE;
- }
-
- state = skip_spaces;
- next_state = parse_name;
- }
- else if (*p == '@') {
- seen_at = TRUE;
- }
-
- p++;
- break;
- case parse_quoted:
- if (*p == '\\') {
- if (p > c) {
- g_string_append_len(ns, c, p - c);
- }
-
- p++;
- c = p;
- }
- else if (*p == '"') {
- if (p > c) {
- g_string_append_len(ns, c, p - c);
- }
-
- if (p + 1 < end && g_ascii_isspace(p[1])) {
- g_string_append_c(ns, ' ');
- }
-
- state = skip_spaces;
- next_state = parse_name;
- }
- else if (*p == '@' && seen_obrace) {
- seen_at = TRUE;
- }
- else if (*p == '<') {
- seen_obrace = TRUE;
- }
- p++;
- break;
- case parse_addr:
- if (*p == '>') {
- int check = rspamd_email_address_check_and_add(c, p - c + 1,
- res, pool, ns, max_elements);
- if (check == 0 && res->len == 0) {
- /* Insert a fake address */
- rspamd_email_address_add(pool, res, NULL, ns);
- }
- else if (check != 1) {
- goto end;
- }
-
- /* Cleanup for the next use */
- g_string_set_size(ns, 0);
- seen_at = FALSE;
- state = skip_spaces;
- next_state = parse_name;
- }
- else if (*p == '@') {
- seen_at = TRUE;
- }
- p++;
- break;
- case skip_spaces:
- if (!g_ascii_isspace(*p)) {
- c = p;
- state = next_state;
- }
- else {
- p++;
- }
- break;
- }
- }
-
- /* Handle leftover */
- switch (state) {
- case parse_name:
- /* Assume the whole header as name (bad thing) */
- if (p > c) {
- while (p > c && g_ascii_isspace(*p)) {
- p--;
- }
-
- if (p > c) {
- if (seen_at) {
- /* The whole email is likely address */
- int check = rspamd_email_address_check_and_add(c, p - c,
- res, pool, ns, max_elements);
- if (check == 0 && res->len == 0) {
- /* Insert a fake address */
- rspamd_email_address_add(pool, res, NULL, ns);
- }
- else if (check != 1) {
- goto end;
- }
- }
- else {
- /* No @ seen */
- g_string_append_len(ns, c, p - c);
-
- if (res->len == 0) {
- rspamd_email_address_add(pool, res, NULL, ns);
- }
- }
- }
- else if (res->len == 0) {
- rspamd_email_address_add(pool, res, NULL, ns);
- }
- }
- break;
- case parse_addr:
- if (p > c) {
- if (rspamd_email_address_check_and_add(c, p - c,
- res, pool, ns, max_elements) == 0) {
- if (res->len == 0) {
- rspamd_email_address_add(pool, res, NULL, ns);
- }
- }
- }
- break;
- case parse_quoted:
- /* Unfinished quoted string or a comment */
- /* If we have seen obrace + at, then we still can try to resolve address */
- if (seen_at && seen_obrace) {
- p = rspamd_memrchr(cpy->str, '<', cpy->len);
- g_assert(p != NULL);
- if (rspamd_email_address_check_and_add(p, end - p,
- res, pool, ns, max_elements) == 0) {
- if (res->len == 0) {
- rspamd_email_address_add(pool, res, NULL, ns);
- }
- }
- }
- break;
- default:
- /* Do nothing */
- break;
- }
- end:
- rspamd_mempool_notify_alloc(pool, cpy->len);
- g_string_free(ns, TRUE);
-
- return res;
- }
-
- void rspamd_email_address_list_destroy(gpointer ptr)
- {
- GPtrArray *ar = ptr;
- unsigned int i;
- struct rspamd_email_address *addr;
-
- PTR_ARRAY_FOREACH(ar, i, addr)
- {
- rspamd_email_address_free(addr);
- }
-
- g_ptr_array_free(ar, TRUE);
- }
|