source.dussan.org Git - rspamd.git/commitdiff
[Project] Rework received headers parsing to C++
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 3 Oct 2021 11:52:45 +0000 (12:52 +0100)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 3 Oct 2021 11:52:45 +0000 (12:52 +0100)
src/libmime/CMakeLists.txt
src/libmime/email_addr.h
src/libmime/message.h
src/libmime/mime_headers.c
src/libmime/mime_headers.h
src/libmime/received.cxx [new file with mode: 0644]
src/libmime/received.h [new file with mode: 0644]
src/libmime/smtp_parsers.h

index 878ac814908c4d816a3539e79a9024b75a4cd963..4a64aac58cc883ff49fd48b2d7baa122801bf0c0 100644 (file)
@@ -1,5 +1,6 @@
 # Librspamd mime
 SET(LIBRSPAMDMIMESRC
+               ${CMAKE_CURRENT_SOURCE_DIR}/received.cxx
                                ${CMAKE_CURRENT_SOURCE_DIR}/email_addr.c
                                ${CMAKE_CURRENT_SOURCE_DIR}/mime_expressions.c
         ${CMAKE_CURRENT_SOURCE_DIR}/scan_result.c
@@ -11,6 +12,7 @@ SET(LIBRSPAMDMIMESRC
                                ${CMAKE_CURRENT_SOURCE_DIR}/mime_parser.c
                                ${CMAKE_CURRENT_SOURCE_DIR}/mime_encoding.c
                                ${CMAKE_CURRENT_SOURCE_DIR}/lang_detection.c
-               ${CMAKE_CURRENT_SOURCE_DIR}/mime_string.cxx)
+               ${CMAKE_CURRENT_SOURCE_DIR}/mime_string.cxx
+               )
 
 SET(RSPAMD_MIME ${LIBRSPAMDMIMESRC} PARENT_SCOPE)
\ No newline at end of file
index fe9fd9e9dadec1f566e5af19c7afedad401f9aac..7e150f80d75fa4a3dc75bc119fef36a8e23d7b0e 100644 (file)
@@ -58,7 +58,6 @@ struct rspamd_email_address {
        guint flags;
 };
 
-struct rspamd_received_header;
 struct rspamd_task;
 
 /**
index a391daf0d673287994ad3ff9dae3167ea2a4f72b..d5329efa705bffac65f6af5699f20837125ad21c 100644 (file)
@@ -174,7 +174,7 @@ struct rspamd_message {
        GPtrArray *parts;                               /**< list of parsed parts                                                       */
        GPtrArray *text_parts;                  /**< list of text parts                                                         */
        struct rspamd_message_raw_headers_content raw_headers_content;
-       struct rspamd_received_header *received;        /**< list of received headers                                           */
+       void *received_headers;                 /**< list of received headers                                           */
        khash_t (rspamd_url_hash) *urls;
        struct rspamd_mime_headers_table *raw_headers;  /**< list of raw headers                                                */
        struct rspamd_mime_header *headers_order;       /**< order of raw headers                                                       */
index 7b5011be4255a9b5d70e8d2cb074a61c224eb537..7afb0e7a6c2aafbcfc64d441145ea17f60e139cd 100644 (file)
@@ -17,9 +17,9 @@
 #include "mime_headers.h"
 #include "smtp_parsers.h"
 #include "mime_encoding.h"
+#include "received.h"
 #include "contrib/uthash/utlist.h"
 #include "libserver/mempool_vars_internal.h"
-#include "libserver/url.h"
 #include "libserver/cfg_file.h"
 #include "libutil/util.h"
 #include <unicode/utf8.h>
@@ -33,9 +33,6 @@ struct rspamd_mime_headers_table {
        ref_entry_t ref;
 };
 
-#define RSPAMD_INET_ADDRESS_PARSE_RECEIVED \
-       (RSPAMD_INET_ADDRESS_PARSE_REMOTE|RSPAMD_INET_ADDRESS_PARSE_NO_UNIX)
-
 static void
 rspamd_mime_header_check_special (struct rspamd_task *task,
                struct rspamd_mime_header *rh)
@@ -913,794 +910,6 @@ rspamd_mime_message_id_generate (const gchar *fqdn)
        return g_string_free (out, FALSE);
 }
 
-enum rspamd_received_part_type {
-       RSPAMD_RECEIVED_PART_FROM,
-       RSPAMD_RECEIVED_PART_BY,
-       RSPAMD_RECEIVED_PART_FOR,
-       RSPAMD_RECEIVED_PART_WITH,
-       RSPAMD_RECEIVED_PART_ID,
-       RSPAMD_RECEIVED_PART_UNKNOWN,
-};
-
-struct rspamd_received_comment {
-       gchar *data;
-       gsize dlen;
-       struct rspamd_received_comment *prev;
-};
-
-struct rspamd_received_part {
-       enum rspamd_received_part_type type;
-       gchar *data;
-       gsize dlen;
-       struct rspamd_received_comment *tail_comment;
-       struct rspamd_received_comment *head_comment;
-       struct rspamd_received_part *prev, *next;
-};
-
-static void
-rspamd_smtp_received_part_set_or_append (struct rspamd_task *task,
-                                                                                const gchar *begin,
-                                                                                gsize len,
-                                                                                gchar **dest,
-                                                                                gsize *destlen)
-{
-       if (len == 0) {
-               return;
-       }
-
-       if (*dest) {
-               /* Append */
-               gsize total_len = *destlen + len;
-               gchar *new_dest;
-
-               new_dest = rspamd_mempool_alloc (task->task_pool, total_len);
-               memcpy (new_dest, *dest, *destlen);
-               memcpy (new_dest + *destlen, begin, len);
-               rspamd_str_lc (new_dest + *destlen, len);
-               *dest = new_dest;
-               *destlen = total_len;
-       }
-       else {
-               /* Set */
-               *dest = rspamd_mempool_alloc (task->task_pool, len);
-               memcpy (*dest, begin, len);
-               rspamd_str_lc (*dest, len);
-               *dest = (gchar *)rspamd_string_len_strip (*dest, &len, " \t");
-               *destlen = len;
-       }
-}
-
-static struct rspamd_received_part *
-rspamd_smtp_received_process_part (struct rspamd_task *task,
-                                                                  const char *data,
-                                                                  size_t len,
-                                                                  enum rspamd_received_part_type type,
-                                                                  goffset *last)
-{
-       struct rspamd_received_part *npart;
-       const guchar *p, *c, *end;
-       guint obraces = 0, ebraces = 0;
-       gboolean seen_tcpinfo = FALSE;
-       enum _parse_state {
-               skip_spaces,
-               in_comment,
-               read_data,
-               read_tcpinfo,
-               all_done
-       } state, next_state;
-
-       npart = rspamd_mempool_alloc0 (task->task_pool, sizeof (*npart));
-       npart->type = type;
-
-       /* In this function, we just process comments and data separately */
-       p = data;
-       end = data + len;
-       c = data;
-       state = skip_spaces;
-       next_state = read_data;
-
-       while (p < end) {
-               switch (state) {
-               case skip_spaces:
-                       if (!g_ascii_isspace (*p)) {
-                               c = p;
-                               state = next_state;
-                       }
-                       else {
-                               p ++;
-                       }
-                       break;
-               case in_comment:
-                       if (*p == '(') {
-                               obraces ++;
-                       }
-                       else if (*p == ')') {
-                               ebraces ++;
-
-                               if (ebraces >= obraces) {
-                                       if (type != RSPAMD_RECEIVED_PART_UNKNOWN) {
-                                               if (p > c) {
-                                                       struct rspamd_received_comment *comment;
-
-
-                                                       comment = rspamd_mempool_alloc0 (task->task_pool,
-                                                                       sizeof (*comment));
-                                                       rspamd_smtp_received_part_set_or_append (task,
-                                                                       c, p - c,
-                                                                       &comment->data, &comment->dlen);
-
-                                                       if (!npart->head_comment) {
-                                                               comment->prev = NULL;
-                                                               npart->head_comment = comment;
-                                                               npart->tail_comment = comment;
-                                                       }
-                                                       else {
-                                                               comment->prev = npart->tail_comment;
-                                                               npart->tail_comment = comment;
-                                                       }
-                                               }
-                                       }
-
-                                       p ++;
-                                       c = p;
-                                       state = skip_spaces;
-                                       next_state = read_data;
-
-                                       continue;
-                               }
-                       }
-
-                       p ++;
-                       break;
-               case read_data:
-                       if (*p == '(') {
-                               if (p > c) {
-                                       if (type != RSPAMD_RECEIVED_PART_UNKNOWN) {
-                                               rspamd_smtp_received_part_set_or_append (task,
-                                                               c, p - c,
-                                                               &npart->data, &npart->dlen);
-                                       }
-                               }
-
-                               state = in_comment;
-                               obraces = 1;
-                               ebraces = 0;
-                               p ++;
-                               c = p;
-                       }
-                       else if (g_ascii_isspace (*p)) {
-                               if (p > c) {
-                                       if (type != RSPAMD_RECEIVED_PART_UNKNOWN) {
-                                               rspamd_smtp_received_part_set_or_append (task,
-                                                               c, p - c,
-                                                               &npart->data, &npart->dlen);
-                                       }
-                               }
-
-                               state = skip_spaces;
-                               next_state = read_data;
-                               c = p;
-                       }
-                       else if (*p == ';') {
-                               /* It is actually delimiter of date part if not in the comments */
-                               if (p > c) {
-                                       if (type != RSPAMD_RECEIVED_PART_UNKNOWN) {
-                                               rspamd_smtp_received_part_set_or_append (task,
-                                                               c, p - c,
-                                                               &npart->data, &npart->dlen);
-                                       }
-                               }
-
-                               state = all_done;
-                               continue;
-                       }
-                       else if (npart->dlen > 0) {
-                               /* We have already received data and find something with no ( */
-                               if (!seen_tcpinfo && type == RSPAMD_RECEIVED_PART_FROM) {
-                                       /* Check if we have something special here, such as TCPinfo */
-                                       if (*c == '[') {
-                                               state = read_tcpinfo;
-                                               p ++;
-                                       }
-                                       else {
-                                               state = all_done;
-                                               continue;
-                                       }
-                               }
-                               else {
-                                       state = all_done;
-                                       continue;
-                               }
-                       }
-                       else {
-                               p ++;
-                       }
-                       break;
-               case read_tcpinfo:
-                       if (*p == ']') {
-                               rspamd_smtp_received_part_set_or_append (task,
-                                               c, p - c + 1,
-                                               &npart->data, &npart->dlen);
-                               seen_tcpinfo = TRUE;
-                               state = skip_spaces;
-                               next_state = read_data;
-                               c = p;
-                       }
-                       p ++;
-                       break;
-               case all_done:
-                       if (p > (const guchar *)data) {
-                               *last = p - (const guchar *) data;
-                               return npart;
-                       }
-                       else {
-                               /* Empty element */
-                               return NULL;
-                       }
-                       break;
-               }
-       }
-
-       /* Leftover */
-       switch (state) {
-       case read_data:
-               if (p > c) {
-                       if (type != RSPAMD_RECEIVED_PART_UNKNOWN) {
-                               rspamd_smtp_received_part_set_or_append (task,
-                                               c, p - c,
-                                               &npart->data, &npart->dlen);
-                       }
-
-                       *last = p - (const guchar *)data;
-
-                       return npart;
-               }
-               break;
-       case skip_spaces:
-               if (p > (const guchar *)data) {
-                       *last = p - (const guchar *) data;
-
-                       return npart;
-               }
-       default:
-               break;
-       }
-
-       return NULL;
-}
-
-static struct rspamd_received_part *
-rspamd_smtp_received_spill (struct rspamd_task *task,
-                                                       const char *data,
-                                                       size_t len,
-                                                       goffset *date_pos)
-{
-       const guchar *p, *end;
-       struct rspamd_received_part *cur_part, *head = NULL;
-       goffset pos = 0;
-
-       p = data;
-       end = data + len;
-
-       while (p < end && g_ascii_isspace (*p)) {
-               p ++;
-       }
-
-       len = end - p;
-
-       /* Ignore all received but those started from from part */
-       if (len <= 4 || (lc_map[p[0]] != 'f' &&
-                                        lc_map[p[1]] != 'r' &&
-                                        lc_map[p[2]] != 'o' &&
-                                        lc_map[p[3]] != 'm')) {
-               return NULL;
-       }
-
-       p += sizeof ("from") - 1;
-
-       /* We can now store from part */
-       cur_part = rspamd_smtp_received_process_part (task, p, end - p,
-                       RSPAMD_RECEIVED_PART_FROM, &pos);
-
-       if (!cur_part) {
-               return NULL;
-       }
-
-       g_assert (pos != 0);
-       p += pos;
-       len = end > p ? end - p : 0;
-       DL_APPEND (head, cur_part);
-
-       if (len > 2 && (lc_map[p[0]] == 'b' &&
-                                       lc_map[p[1]] == 'y')) {
-               p += sizeof ("by") - 1;
-
-               cur_part = rspamd_smtp_received_process_part (task, p, end - p,
-                               RSPAMD_RECEIVED_PART_BY, &pos);
-
-               if (!cur_part) {
-                       return NULL;
-               }
-
-               g_assert (pos != 0);
-               p += pos;
-               len = end > p ? end - p : 0;
-               DL_APPEND (head, cur_part);
-       }
-
-       while (p < end) {
-               if (*p == ';') {
-                       /* We are at the date separator, stop here */
-                       *date_pos = p - (const guchar *)data + 1;
-                       break;
-               }
-               else {
-                       if (len > sizeof ("with") && (lc_map[p[0]] == 'w' &&
-                                                                                 lc_map[p[1]] == 'i' &&
-                                                                                 lc_map[p[2]] == 't' &&
-                                                                                 lc_map[p[3]] == 'h')) {
-                               p += sizeof ("with") - 1;
-
-                               cur_part = rspamd_smtp_received_process_part (task, p, end - p,
-                                               RSPAMD_RECEIVED_PART_WITH, &pos);
-                       }
-                       else if (len > sizeof ("for") && (lc_map[p[0]] == 'f' &&
-                                                                                         lc_map[p[1]] == 'o' &&
-                                                                                         lc_map[p[2]] == 'r')) {
-                               p += sizeof ("for") - 1;
-                               cur_part = rspamd_smtp_received_process_part (task, p, end - p,
-                                               RSPAMD_RECEIVED_PART_FOR, &pos);
-                       }
-                       else if (len > sizeof ("id") && (lc_map[p[0]] == 'i' &&
-                                                                                         lc_map[p[1]] == 'd')) {
-                               p += sizeof ("id") - 1;
-                               cur_part = rspamd_smtp_received_process_part (task, p, end - p,
-                                               RSPAMD_RECEIVED_PART_ID, &pos);
-                       }
-                       else {
-                               while (p < end) {
-                                       if (!(g_ascii_isspace (*p) || *p == '(' || *p == ';')) {
-                                               p ++;
-                                       }
-                                       else {
-                                               break;
-                                       }
-                               }
-
-                               if (p == end) {
-                                       return NULL;
-                               }
-                               else if (*p == ';') {
-                                       *date_pos = p - (const guchar *)data + 1;
-                                       break;
-                               }
-                               else {
-                                       cur_part = rspamd_smtp_received_process_part (task, p, end - p,
-                                                       RSPAMD_RECEIVED_PART_UNKNOWN, &pos);
-                               }
-                       }
-
-                       if (!cur_part) {
-                               p ++;
-                               len = end > p ? end - p : 0;
-                       }
-                       else {
-                               g_assert (pos != 0);
-                               p += pos;
-                               len = end > p ? end - p : 0;
-                               DL_APPEND (head, cur_part);
-                       }
-               }
-       }
-
-       return head;
-}
-
-static gboolean
-rspamd_smtp_received_process_rdns (struct rspamd_task *task,
-                                                                  const gchar *begin,
-                                                                  gsize len,
-                                                                  const gchar **pdest)
-{
-       const gchar *p, *end;
-       gsize hlen = 0;
-       gboolean seen_dot = FALSE;
-
-       p = begin;
-       end = begin + len;
-
-       if (len == 0) {
-               return FALSE;
-       }
-
-       if (*p == '[' && *(end - 1) == ']' && len > 2) {
-               /* We have enclosed ip address */
-               rspamd_inet_addr_t  *addr = rspamd_parse_inet_address_pool (p + 1,
-                               (end - p) - 2,
-                               task->task_pool,
-                               RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
-
-               if (addr) {
-                       const gchar *addr_str;
-                       gchar *dest;
-
-                       if (rspamd_inet_address_get_port (addr) != 0) {
-                               addr_str = rspamd_inet_address_to_string_pretty (addr);
-                       }
-                       else {
-                               addr_str = rspamd_inet_address_to_string (addr);
-                       }
-                       dest = rspamd_mempool_strdup (task->task_pool, addr_str);
-                       *pdest = dest;
-
-                       return TRUE;
-               }
-       }
-
-       while (p < end) {
-               if (!g_ascii_isspace (*p) && rspamd_url_is_domain (*p)) {
-                       if (*p == '.') {
-                               seen_dot = TRUE;
-                       }
-
-                       hlen ++;
-               }
-               else {
-                       break;
-               }
-
-               p ++;
-       }
-
-       if (hlen > 0) {
-               if (p == end) {
-                       /* All data looks like a hostname */
-                       gchar *dest;
-
-                       dest = rspamd_mempool_alloc (task->task_pool,
-                                       hlen + 1);
-                       rspamd_strlcpy (dest, begin, hlen + 1);
-                       *pdest = dest;
-
-                       return TRUE;
-               }
-               else if (seen_dot && (g_ascii_isspace (*p) || *p == '[' || *p == '(')) {
-                       gchar *dest;
-
-                       dest = rspamd_mempool_alloc (task->task_pool,
-                                       hlen + 1);
-                       rspamd_strlcpy (dest, begin, hlen + 1);
-                       *pdest = dest;
-
-                       return TRUE;
-               }
-       }
-
-       return FALSE;
-}
-
-static gboolean
-rspamd_smtp_received_process_host_tcpinfo (struct rspamd_task *task,
-                                                                                  struct rspamd_received_header *rh,
-                                                                                  const gchar *data,
-                                                                                  gsize len)
-{
-       rspamd_inet_addr_t *addr = NULL;
-       gboolean ret = FALSE;
-
-       if (data[0] == '[') {
-               /* Likely Exim version */
-
-               const gchar *brace_pos = memchr (data, ']', len);
-
-               if (brace_pos) {
-                       addr = rspamd_parse_inet_address_pool (data + 1,
-                                       brace_pos - data - 1,
-                                       task->task_pool,
-                                       RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
-
-                       if (addr) {
-                               rh->addr = addr;
-                               rh->real_ip = rspamd_mempool_strdup (task->task_pool,
-                                               rspamd_inet_address_to_string (addr));
-                               rh->from_ip = rh->real_ip;
-                       }
-               }
-       }
-       else {
-               if (g_ascii_isxdigit (data[0])) {
-                       /* Try to parse IP address */
-                       addr = rspamd_parse_inet_address_pool (data,
-                                       len, task->task_pool, RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
-                       if (addr) {
-                               rh->addr = addr;
-                               rh->real_ip = rspamd_mempool_strdup (task->task_pool,
-                                               rspamd_inet_address_to_string (addr));
-                               rh->from_ip = rh->real_ip;
-                       }
-               }
-
-               if (!addr) {
-                       /* Try canonical Postfix version: rdns [ip] */
-                       const gchar *obrace_pos = memchr (data, '[', len),
-                                       *ebrace_pos, *dend;
-
-                       if (obrace_pos) {
-                               dend = data + len;
-                               ebrace_pos = memchr (obrace_pos, ']', dend - obrace_pos);
-
-                               if (ebrace_pos) {
-                                       addr = rspamd_parse_inet_address_pool (obrace_pos + 1,
-                                                       ebrace_pos - obrace_pos - 1,
-                                                       task->task_pool,
-                                                       RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
-
-                                       if (addr) {
-                                               rh->addr = addr;
-                                               rh->real_ip = rspamd_mempool_strdup (task->task_pool,
-                                                               rspamd_inet_address_to_string (addr));
-                                               rh->from_ip = rh->real_ip;
-
-                                               /* Process with rDNS */
-                                               if (rspamd_smtp_received_process_rdns (task,
-                                                               data,
-                                                               obrace_pos - data,
-                                                               &rh->real_hostname)) {
-                                                       ret = TRUE;
-                                               }
-                                       }
-                               }
-                       }
-                       else {
-                               /* Hostname or some crap, sigh... */
-                               if (rspamd_smtp_received_process_rdns (task,
-                                               data,
-                                               len,
-                                               &rh->real_hostname)) {
-                                       ret = TRUE;
-                               }
-                       }
-               }
-       }
-
-       return ret;
-}
-
-static void
-rspamd_smtp_received_process_from (struct rspamd_task *task,
-                                                                  struct rspamd_received_part *rpart,
-                                                                  struct rspamd_received_header *rh)
-{
-       if (rpart->dlen > 0) {
-               /* We have seen multiple cases:
-                * - [ip] (hostname/unknown [real_ip])
-                * - helo (hostname/unknown [real_ip])
-                * - [ip]
-                * - hostname
-                * - hostname ([ip]:port helo=xxx)
-                * Maybe more...
-                */
-               gboolean seen_ip_in_data = FALSE;
-
-               if (rpart->head_comment && rpart->head_comment->dlen > 0) {
-                       /* We can have info within comment as part of RFC */
-                       rspamd_smtp_received_process_host_tcpinfo (
-                                       task, rh,
-                                       rpart->head_comment->data, rpart->head_comment->dlen);
-               }
-
-               if (!rh->real_ip) {
-                       if (rpart->data[0] == '[') {
-                               /* No comment, just something that looks like SMTP IP */
-                               const gchar *brace_pos = memchr (rpart->data, ']', rpart->dlen);
-                               rspamd_inet_addr_t *addr;
-
-                               if (brace_pos) {
-                                       addr = rspamd_parse_inet_address_pool (rpart->data + 1,
-                                                       brace_pos - rpart->data - 1,
-                                                       task->task_pool,
-                                                       RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
-
-                                       if (addr) {
-                                               seen_ip_in_data = TRUE;
-                                               rh->addr = addr;
-                                               rh->real_ip = rspamd_mempool_strdup (task->task_pool,
-                                                               rspamd_inet_address_to_string (addr));
-                                               rh->from_ip = rh->real_ip;
-                                       }
-                               }
-                       }
-                       else if (g_ascii_isxdigit (rpart->data[0])) {
-                               /* Try to parse IP address */
-                               rspamd_inet_addr_t *addr;
-                               addr = rspamd_parse_inet_address_pool (rpart->data,
-                                               rpart->dlen, task->task_pool,
-                                               RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
-                               if (addr) {
-                                       seen_ip_in_data = TRUE;
-                                       rh->addr = addr;
-                                       rh->real_ip = rspamd_mempool_strdup (task->task_pool,
-                                                       rspamd_inet_address_to_string (addr));
-                                       rh->from_ip = rh->real_ip;
-                               }
-                       }
-               }
-
-               if (!seen_ip_in_data) {
-                       if (rh->real_ip) {
-                               /* Get anounced hostname (usually helo) */
-                               rspamd_smtp_received_process_rdns (task,
-                                               rpart->data,
-                                               rpart->dlen,
-                                               &rh->from_hostname);
-                       }
-                       else {
-                               rspamd_smtp_received_process_host_tcpinfo (task,
-                                               rh, rpart->data, rpart->dlen);
-                       }
-               }
-       }
-       else {
-               /* rpart->dlen = 0 */
-
-               if (rpart->head_comment && rpart->head_comment->dlen > 0) {
-                       rspamd_smtp_received_process_host_tcpinfo (task,
-                                       rh,
-                                       rpart->head_comment->data,
-                                       rpart->head_comment->dlen);
-               }
-       }
-}
-
-int
-rspamd_smtp_received_parse (struct rspamd_task *task,
-                                                       const char *data,
-                                                       size_t len,
-                                                       struct rspamd_received_header *rh)
-{
-       goffset date_pos = -1;
-       struct rspamd_received_part *head, *cur;
-       rspamd_ftok_t t1, t2;
-
-       head = rspamd_smtp_received_spill (task, data, len, &date_pos);
-
-       if (head == NULL) {
-               return -1;
-       }
-
-       rh->flags = RSPAMD_RECEIVED_UNKNOWN;
-
-       DL_FOREACH (head, cur) {
-               switch (cur->type) {
-               case RSPAMD_RECEIVED_PART_FROM:
-                       rspamd_smtp_received_process_from (task, cur, rh);
-                       break;
-               case RSPAMD_RECEIVED_PART_BY:
-                       rspamd_smtp_received_process_rdns (task,
-                                       cur->data,
-                                       cur->dlen,
-                                       &rh->by_hostname);
-                       break;
-               case RSPAMD_RECEIVED_PART_WITH:
-                       t1.begin = cur->data;
-                       t1.len = cur->dlen;
-
-                       if (t1.len > 0) {
-                               RSPAMD_FTOK_ASSIGN (&t2, "smtp");
-
-                               if (rspamd_ftok_cmp (&t1, &t2) == 0) {
-                                       rh->flags = RSPAMD_RECEIVED_SMTP;
-                               }
-
-                               RSPAMD_FTOK_ASSIGN (&t2, "esmtp");
-
-                               if (rspamd_ftok_starts_with (&t1, &t2)) {
-                                       /*
-                                        * esmtp, esmtps, esmtpsa
-                                        */
-                                       if (t1.len == t2.len + 1) {
-                                               if (t1.begin[t2.len] == 'a') {
-                                                       rh->flags = RSPAMD_RECEIVED_ESMTPA;
-                                                       rh->flags |= RSPAMD_RECEIVED_FLAG_AUTHENTICATED;
-                                               }
-                                               else if (t1.begin[t2.len] == 's') {
-                                                       rh->flags = RSPAMD_RECEIVED_ESMTPS;
-                                                       rh->flags |= RSPAMD_RECEIVED_FLAG_SSL;
-                                               }
-                                               continue;
-                                       }
-                                       else if (t1.len == t2.len + 2) {
-                                               if (t1.begin[t2.len] == 's' &&
-                                                               t1.begin[t2.len + 1] == 'a') {
-                                                       rh->flags = RSPAMD_RECEIVED_ESMTPSA;
-                                                       rh->flags |= RSPAMD_RECEIVED_FLAG_AUTHENTICATED;
-                                                       rh->flags |= RSPAMD_RECEIVED_FLAG_SSL;
-                                               }
-                                               continue;
-                                       }
-                                       else if (t1.len == t2.len) {
-                                               rh->flags = RSPAMD_RECEIVED_ESMTP;
-                                               continue;
-                                       }
-                               }
-
-                               RSPAMD_FTOK_ASSIGN (&t2, "lmtp");
-
-                               if (rspamd_ftok_cmp (&t1, &t2) == 0) {
-                                       rh->flags = RSPAMD_RECEIVED_LMTP;
-                                       continue;
-                               }
-
-                               RSPAMD_FTOK_ASSIGN (&t2, "imap");
-
-                               if (rspamd_ftok_cmp (&t1, &t2) == 0) {
-                                       rh->flags = RSPAMD_RECEIVED_IMAP;
-                                       continue;
-                               }
-
-                               RSPAMD_FTOK_ASSIGN (&t2, "local");
-
-                               if (rspamd_ftok_cmp (&t1, &t2) == 0) {
-                                       rh->flags = RSPAMD_RECEIVED_LOCAL;
-                                       continue;
-                               }
-
-                               RSPAMD_FTOK_ASSIGN (&t2, "http");
-
-                               if (rspamd_ftok_starts_with (&t1, &t2)) {
-                                       if (t1.len == t2.len + 1) {
-                                               if (t1.begin[t2.len] == 's') {
-                                                       rh->flags = RSPAMD_RECEIVED_HTTP;
-                                                       rh->flags |= RSPAMD_RECEIVED_FLAG_SSL;
-                                               }
-                                       }
-                                       else if (t1.len == t2.len) {
-                                               rh->flags = RSPAMD_RECEIVED_HTTP;
-                                       }
-
-                                       continue;
-                               }
-                       }
-
-                       break;
-               case RSPAMD_RECEIVED_PART_FOR:
-                       rh->for_addr = rspamd_email_address_from_smtp (cur->data, cur->dlen);
-
-                       if (rh->for_addr) {
-                               if (rh->for_addr->addr_len > 0) {
-                                       t1.begin = rh->for_addr->addr;
-                                       t1.len = rh->for_addr->addr_len;
-                                       rh->for_mbox = rspamd_mempool_ftokdup (task->task_pool,
-                                                       &t1);
-                               }
-
-                               rspamd_mempool_add_destructor (task->task_pool,
-                                               (rspamd_mempool_destruct_t)rspamd_email_address_free,
-                                               rh->for_addr);
-                       }
-                       break;
-               default:
-                       /* Do nothing */
-                       break;
-               }
-       }
-
-       if (rh->real_ip && !rh->from_ip) {
-               rh->from_ip = rh->real_ip;
-       }
-
-       if (rh->real_hostname && !rh->from_hostname) {
-               rh->from_hostname = rh->real_hostname;
-       }
-
-       if (date_pos > 0 && date_pos < len) {
-               rh->timestamp = rspamd_parse_smtp_date (data + date_pos,
-                               len - date_pos, NULL);
-       }
-
-       return 0;
-}
-
 struct rspamd_mime_header *
 rspamd_message_get_header_from_hash (struct rspamd_mime_headers_table *hdrs,
                                                                         const gchar *field,
index f24b0d6c6bc079df521243b33c619e5300a510db..07a64c31ee1753ebe82af4d62a3beb9637c016fd 100644 (file)
@@ -72,52 +72,6 @@ struct rspamd_mime_header {
 
 struct rspamd_mime_headers_table;
 
-enum rspamd_received_type {
-       RSPAMD_RECEIVED_SMTP = 1u << 0u,
-       RSPAMD_RECEIVED_ESMTP = 1u << 1u,
-       RSPAMD_RECEIVED_ESMTPA = 1u << 2u,
-       RSPAMD_RECEIVED_ESMTPS = 1u << 3u,
-       RSPAMD_RECEIVED_ESMTPSA = 1u << 4u,
-       RSPAMD_RECEIVED_LMTP = 1u << 5u,
-       RSPAMD_RECEIVED_IMAP = 1u << 6u,
-       RSPAMD_RECEIVED_LOCAL = 1u << 7u,
-       RSPAMD_RECEIVED_HTTP = 1u << 8u,
-       RSPAMD_RECEIVED_MAPI = 1u << 9u,
-       RSPAMD_RECEIVED_UNKNOWN = 1u << 10u,
-       RSPAMD_RECEIVED_FLAG_ARTIFICIAL =  (1u << 11u),
-       RSPAMD_RECEIVED_FLAG_SSL =  (1u << 12u),
-       RSPAMD_RECEIVED_FLAG_AUTHENTICATED =  (1u << 13u),
-};
-
-#define RSPAMD_RECEIVED_FLAG_TYPE_MASK (RSPAMD_RECEIVED_SMTP| \
-                       RSPAMD_RECEIVED_ESMTP| \
-                       RSPAMD_RECEIVED_ESMTPA| \
-                       RSPAMD_RECEIVED_ESMTPS| \
-                       RSPAMD_RECEIVED_ESMTPSA| \
-                       RSPAMD_RECEIVED_LMTP| \
-                       RSPAMD_RECEIVED_IMAP| \
-                       RSPAMD_RECEIVED_LOCAL| \
-                       RSPAMD_RECEIVED_HTTP| \
-                       RSPAMD_RECEIVED_MAPI| \
-                       RSPAMD_RECEIVED_UNKNOWN)
-
-struct rspamd_email_address;
-
-struct rspamd_received_header {
-       const gchar *from_hostname;
-       const gchar *from_ip;
-       const gchar *real_hostname;
-       const gchar *real_ip;
-       const gchar *by_hostname;
-       const gchar *for_mbox;
-       struct rspamd_email_address *for_addr;
-       rspamd_inet_addr_t *addr;
-       struct rspamd_mime_header *hdr;
-       time_t timestamp;
-       gint flags; /* See enum rspamd_received_type */
-       struct rspamd_received_header *prev, *next;
-};
-
 /**
  * Process headers and store them in `target`
  * @param task
diff --git a/src/libmime/received.cxx b/src/libmime/received.cxx
new file mode 100644 (file)
index 0000000..78c9f18
--- /dev/null
@@ -0,0 +1,745 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "config.h"
+#include "received.h"
+#include "libserver/task.h"
+#include "libserver/url.h"
+#include "mime_string.hxx"
+#include "smtp_parsers.h"
+#include "message.h"
+
+#include <vector>
+#include <string_view>
+#include <utility>
+#include "frozen/string.h"
+#include "frozen/unordered_map.h"
+
+namespace rspamd::mime {
+
+enum class received_part_type { /* Kinds of clauses recognised by received_spill */
+       RSPAMD_RECEIVED_PART_FROM, /* `from` clause; received_spill requires it to come first */
+       RSPAMD_RECEIVED_PART_BY, /* `by` clause */
+       RSPAMD_RECEIVED_PART_FOR, /* `for` clause */
+       RSPAMD_RECEIVED_PART_WITH, /* `with` clause */
+       RSPAMD_RECEIVED_PART_ID, /* `id` clause */
+       RSPAMD_RECEIVED_PART_UNKNOWN, /* any other token; its text is skipped, not stored */
+};
+
+static inline auto
+received_char_filter(UChar32 uc) -> UChar32
+{
+       /* Printable code points are lowercased, everything else is mapped to 0 */
+       return u_isprint(uc) ? u_tolower(uc) : 0;
+}
+
+
+struct received_header {
+       mime_string from_hostname;
+       std::string_view from_ip;
+       mime_string real_hostname;
+       mime_string real_ip;
+       mime_string by_hostname;
+       std::string_view for_mbox;
+       struct rspamd_email_address *for_addr = nullptr;
+       rspamd_inet_addr_t *addr = nullptr;
+       struct rspamd_mime_header *hdr = nullptr;
+       time_t timestamp = 0;
+       int flags = 0; /* See enum rspamd_received_type */
+
+       received_header() noexcept
+                       : from_hostname(received_char_filter),
+                         real_hostname(received_char_filter),
+                         real_ip(received_char_filter),
+                         by_hostname(received_char_filter),
+                         for_mbox(received_char_filter) {}
+
+       ~received_header() {
+               if (for_addr) {
+                       rspamd_email_address_free(for_addr);
+               }
+       }
+};
+/*
+ * Holds the received headers parsed for one task.
+ *
+ * The constructor registers a destructor in the task memory pool that calls
+ * `delete` on the chain, so instances are expected to be allocated with `new`
+ * and released by the pool rather than by the caller.
+ */
+class received_header_chain {
+public:
+       explicit received_header_chain(struct rspamd_task *_task) : task(_task) {
+               headers.reserve(2);
+               rspamd_mempool_add_destructor(task->task_pool,
+                               received_header_chain::received_header_chain_pool_dtor, this);
+       }
+       /*
+        * Appends a default constructed header and returns a reference to it.
+        * The reference points into a std::vector, so a later call that makes
+        * the vector grow invalidates it.
+        */
+       auto new_received() -> received_header & {
+               headers.emplace_back();
+               return headers.back();
+       }
+private:
+       static auto received_header_chain_pool_dtor(void *ptr) -> void { /* memory pool callback */
+               delete static_cast<received_header_chain *>(ptr);
+       }
+       std::vector<received_header> headers;
+       struct rspamd_task *task;
+};
+/*
+ * One clause of a Received header as produced by received_process_part:
+ * the kind of the clause, its plain data and the parenthesised comments
+ * found inside it.
+ */
+struct received_part {
+       received_part_type type;
+       mime_string data; /* text outside of parentheses, passed through received_char_filter */
+       std::vector<mime_string> comments; /* one entry per non-empty `(...)` comment, in order */
+
+       explicit received_part(received_part_type t)
+                                                                 : type(t),
+                                                                       data(received_char_filter) {}
+};
+
+static inline auto
+received_part_set_or_append(struct rspamd_task *task,
+                                                                               const gchar *begin,
+                                                                               gsize len,
+                                                                               mime_string &dest) -> void
+{
+       if (len == 0) {
+               return;
+       }
+
+       dest.append(begin, len);
+       dest.trim(" \t");
+}
+/*
+ * Parses one clause of a Received header (the text that follows a keyword
+ * such as `from` or `by`) with a small state machine that separates plain
+ * data from parenthesised comments.
+ *
+ * @param task task passed through to received_part_set_or_append
+ * @param data text to parse, starting right after the clause keyword
+ * @param type kind of the clause; for RSPAMD_RECEIVED_PART_UNKNOWN the text
+ *             is only skipped and nothing is stored in `npart`
+ * @param last on success receives the offset in `data` where parsing stopped
+ * @param npart receives the data and the comments of the clause
+ * @return true if something was consumed; false for an empty element or when
+ *         the input ends inside a comment or inside a `[...]` block
+ */
+static auto
+received_process_part(struct rspamd_task *task,
+                                         const std::string_view &data,
+                                         received_part_type type,
+                                         std::ptrdiff_t &last,
+                                         received_part &npart) -> bool
+{
+       auto obraces = 0, ebraces = 0; /* '(' and ')' seen in the current comment */
+       auto seen_tcpinfo = false;
+       enum _parse_state {
+               skip_spaces,
+               in_comment,
+               read_data,
+               read_tcpinfo,
+               all_done
+       } state, next_state;
+
+       /* In this function, we just process comments and data separately */
+       const auto *p = data.data();
+       const auto *end = p + data.size();
+       const auto *c = p; /* start of the chunk that is being collected */
+
+       state = skip_spaces;
+       next_state = read_data;
+
+       while (p < end) {
+               switch (state) {
+               case skip_spaces: /* skip whitespace, then continue with next_state */
+                       if (!g_ascii_isspace(*p)) {
+                               c = p;
+                               state = next_state;
+                       }
+                       else {
+                               p++;
+                       }
+                       break;
+               case in_comment: /* inside `(...)`, nested parentheses are counted */
+                       if (*p == '(') {
+                               obraces++;
+                       }
+                       else if (*p == ')') {
+                               ebraces++;
+                               /* Every opened parenthesis is closed: the comment is complete */
+                               if (ebraces >= obraces) {
+                                       if (type != received_part_type::RSPAMD_RECEIVED_PART_UNKNOWN) {
+                                               if (p > c) {
+                                                       npart.comments.emplace_back(received_char_filter);
+                                                       auto &comment = npart.comments.back();
+                                                       received_part_set_or_append(task,
+                                                                       c, p - c,
+                                                                       comment);
+                                               }
+                                       }
+
+                                       p++;
+                                       c = p;
+                                       state = skip_spaces;
+                                       next_state = read_data;
+
+                                       continue;
+                               }
+                       }
+
+                       p++;
+                       break;
+               case read_data: /* plain data, ended by '(', whitespace or ';' */
+                       if (*p == '(') {
+                               if (p > c) {
+                                       if (type != received_part_type::RSPAMD_RECEIVED_PART_UNKNOWN) {
+                                               received_part_set_or_append(task,
+                                                               c, p - c,
+                                                               npart.data);
+                                       }
+                               }
+
+                               state = in_comment;
+                               obraces = 1;
+                               ebraces = 0;
+                               p++;
+                               c = p;
+                       }
+                       else if (g_ascii_isspace (*p)) {
+                               if (p > c) {
+                                       if (type != received_part_type::RSPAMD_RECEIVED_PART_UNKNOWN) {
+                                               received_part_set_or_append(task,
+                                                               c, p - c,
+                                                               npart.data);
+                                       }
+                               }
+
+                               state = skip_spaces;
+                               next_state = read_data;
+                               c = p;
+                       }
+                       else if (*p == ';') {
+                               /* It is actually delimiter of date part if not in the comments */
+                               if (p > c) {
+                                       if (type != received_part_type::RSPAMD_RECEIVED_PART_UNKNOWN) {
+                                               received_part_set_or_append(task,
+                                                               c, p - c,
+                                                               npart.data);
+                                       }
+                               }
+
+                               state = all_done;
+                               continue;
+                       }
+                       else if (npart.data.size() > 0) {
+                               /* We have already received data and find something with no ( */
+                               if (!seen_tcpinfo && type == received_part_type::RSPAMD_RECEIVED_PART_FROM) {
+                                       /* Check if we have something special here, such as TCPinfo */
+                                       if (*c == '[') {
+                                               state = read_tcpinfo;
+                                               p++;
+                                       }
+                                       else {
+                                               state = all_done;
+                                               continue;
+                                       }
+                               }
+                               else {
+                                       state = all_done;
+                                       continue;
+                               }
+                       }
+                       else {
+                               p++;
+                       }
+                       break;
+               case read_tcpinfo: /* collect everything up to and including the closing ']' */
+                       if (*p == ']') {
+                               received_part_set_or_append(task,
+                                               c, p - c + 1,
+                                               npart.data);
+                               seen_tcpinfo = TRUE;
+                               state = skip_spaces;
+                               next_state = read_data;
+                               c = p;
+                       }
+                       p++;
+                       break;
+               case all_done: /* report how much of `data` has been consumed */
+                       if (p > data.data()) {
+                               last = p - data.data();
+                               return true;
+                       }
+                       else {
+                               /* Empty element */
+                               return false;
+                       }
+                       break;
+               }
+       }
+
+       /* Leftover */
+       switch (state) {
+       case read_data:
+               if (p > c) {
+                       if (type != received_part_type::RSPAMD_RECEIVED_PART_UNKNOWN) {
+                               received_part_set_or_append(task,
+                                               c, p - c,
+                                               npart.data);
+                       }
+
+                       last = p - data.data();
+
+                       return true;
+               }
+               break;
+       case skip_spaces:
+               if (p > data.data()) {
+                       last = p - data.data();
+
+                       return true;
+               } /* otherwise falls through to default, which only breaks */
+       default:
+               break;
+       }
+       /* Input ended inside a comment or a `[...]` block, or nothing was consumed */
+       return false;
+}
+
+template <std::size_t N>
+constexpr auto lit_compare_lowercase(const char lit[N], const char *in) -> bool
+{
+       for (auto i = 0; i < N; i ++) {
+               if (lc_map[(unsigned char)in[i]] != lit[i]) {
+                       return false;
+               }
+       }
+
+       return true;
+}
+/*
+ * Splits a raw Received header value into its clauses.
+ *
+ * The value must start (after optional whitespace) with `from`; a `by`
+ * clause may follow, and then any number of `with`, `for`, `id` or
+ * unrecognised clauses up to the `;` that precedes the date. Keywords are
+ * matched with lit_compare_lowercase.
+ *
+ * @param task task passed through to received_process_part
+ * @param in raw header value
+ * @param date_pos set to the offset in `in` just after the `;` separator
+ *                 when the loop stops on one; left untouched otherwise
+ * @return the clauses in order of appearance (unrecognised clauses are kept
+ *         with their type only, no data is stored for them); an empty vector
+ *         if the value does not start with `from`, if the `from` or `by`
+ *         clause cannot be parsed, or if an unrecognised token runs up to
+ *         the end of the input
+ */
+static auto
+received_spill(struct rspamd_task *task,
+                          const std::string_view &in,
+                          std::ptrdiff_t &date_pos) -> std::vector<received_part>
+{
+       std::vector<received_part> parts;
+       std::ptrdiff_t pos = 0; /* length consumed by the last received_process_part call */
+
+       const auto *p = in.data();
+       const auto *end = p + in.size();
+
+       while (p < end && g_ascii_isspace (*p)) {
+               p++;
+       }
+
+       auto len = end - p;
+
+       /* Ignore all received but those started from from part */
+       if (len <= 4 || !lit_compare_lowercase<4>("from", p)) {
+               return {};
+       }
+
+       p += sizeof("from") - 1;
+       /*
+        * Parses the clause that starts at `p` into a new element of `parts`;
+        * the element is dropped again if parsing fails. On success `pos` holds
+        * the number of bytes consumed, `p` itself is not advanced here.
+        */
+       auto maybe_process_part = [&](received_part_type what) -> bool {
+               parts.emplace_back(what);
+               auto &rcvd_part = parts.back();
+               auto chunk = std::string_view{p, (std::size_t)(end - p)};
+
+               if (!received_process_part(task, chunk, what, pos, rcvd_part)) {
+                       parts.pop_back();
+
+                       return false;
+               }
+
+               return true;
+       };
+
+       /* We can now store from part */
+       if (!maybe_process_part(received_part_type::RSPAMD_RECEIVED_PART_FROM)) {
+               return {};
+       }
+
+       g_assert (pos != 0);
+       p += pos;
+       len = end > p ? end - p : 0;
+       /* Optional `by` clause; a `by` that cannot be parsed rejects the whole header */
+       if (len > 2 && lit_compare_lowercase<2>("by", p)) {
+               p += sizeof("by") - 1;
+
+               if (!maybe_process_part(received_part_type::RSPAMD_RECEIVED_PART_BY)) {
+                       return {};
+               }
+
+               g_assert (pos != 0);
+               p += pos;
+               len = end > p ? end - p : 0;
+       }
+       /* Remaining clauses, in any order, until the date separator */
+       while (p < end) {
+               bool got_part = false;
+               if (*p == ';') {
+                       /* We are at the date separator, stop here */
+                       date_pos = p - in.data() + 1;
+                       break;
+               }
+               else {
+                       if (len > sizeof("with") && lit_compare_lowercase<4>("with", p)) { /* sizeof counts the NUL */
+                               p += sizeof("with") - 1;
+
+                               got_part = maybe_process_part(received_part_type::RSPAMD_RECEIVED_PART_WITH);
+                       }
+                       else if (len > sizeof("for") && lit_compare_lowercase<3>("for", p)) {
+                               p += sizeof("for") - 1;
+                               got_part = maybe_process_part(received_part_type::RSPAMD_RECEIVED_PART_FOR);
+                       }
+                       else if (len > sizeof("id") && lit_compare_lowercase<2>("id", p)) {
+                               p += sizeof("id") - 1;
+                               got_part = maybe_process_part(received_part_type::RSPAMD_RECEIVED_PART_ID);
+                       }
+                       else { /* unrecognised token: skip it up to whitespace, '(' or ';' */
+                               while (p < end) {
+                                       if (!(g_ascii_isspace (*p) || *p == '(' || *p == ';')) {
+                                               p++;
+                                       }
+                                       else {
+                                               break;
+                                       }
+                               }
+                               /* A token that runs up to the end of input rejects the whole header */
+                               if (p == end) {
+                                       return {};
+                               }
+                               else if (*p == ';') {
+                                       date_pos = p - in.data() + 1;
+                                       break;
+                               }
+                               else {
+                                       got_part = maybe_process_part(received_part_type::RSPAMD_RECEIVED_PART_UNKNOWN);
+                               }
+                       }
+                       /* Advance past the parsed clause, or by one byte if nothing was parsed */
+                       if (!got_part) {
+                               p++;
+                               len = end > p ? end - p : 0;
+                       }
+                       else {
+                               g_assert (pos != 0);
+                               p += pos;
+                               len = end > p ? end - p : 0;
+                       }
+               }
+       }
+
+       return parts;
+}
+
+#define RSPAMD_INET_ADDRESS_PARSE_RECEIVED \
+       (rspamd_inet_address_parse_flags)(RSPAMD_INET_ADDRESS_PARSE_REMOTE|RSPAMD_INET_ADDRESS_PARSE_NO_UNIX)
+
+static auto
+received_process_rdns(struct rspamd_task *task,
+                                                                 const std::string_view &in,
+                                                                 mime_string &dest) -> bool
+{
+       auto seen_dot = false;
+
+       const auto *p = in.data();
+       const auto *end = p + in.size();
+
+       if (in.empty()) {
+               return false;
+       }
+
+       if (*p == '[' && *(end - 1) == ']' && in.size() > 2) {
+               /* We have enclosed ip address */
+               auto *addr = rspamd_parse_inet_address_pool(p + 1,
+                               (end - p) - 2,
+                               task->task_pool,
+                               RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
+
+               if (addr) {
+                       const gchar *addr_str;
+
+                       if (rspamd_inet_address_get_port(addr) != 0) {
+                               addr_str = rspamd_inet_address_to_string_pretty(addr);
+                       }
+                       else {
+                               addr_str = rspamd_inet_address_to_string(addr);
+                       }
+
+                       dest.assign_copy(std::string_view{addr_str});
+
+                       return true;
+               }
+       }
+
+       auto hlen = 0u;
+
+       while (p < end) {
+               if (!g_ascii_isspace(*p) && rspamd_url_is_domain(*p)) {
+                       if (*p == '.') {
+                               seen_dot = true;
+                       }
+
+                       hlen++;
+               }
+               else {
+                       break;
+               }
+
+               p++;
+       }
+
+       if (hlen > 0) {
+               if (p == end || (seen_dot && (g_ascii_isspace(*p) || *p == '[' || *p == '('))) {
+                       /* All data looks like a hostname */
+                       dest.assign_copy(std::string_view{in.data(), hlen});
+
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+static auto
+received_process_host_tcpinfo(struct rspamd_task *task,
+                                                         received_header &rh,
+                                                         const std::string_view &in) -> bool
+{
+       rspamd_inet_addr_t *addr = nullptr;
+       auto ret = false;
+
+       if (in.empty()) {
+               return false;
+       }
+
+       if (in[0] == '[') {
+               /* Likely Exim version */
+
+               auto brace_pos = in.find(']');
+
+               if (brace_pos != std::string_view::npos) {
+                       auto substr_addr = in.substr(1, brace_pos - 1);
+                       addr = rspamd_parse_inet_address_pool(substr_addr.data(),
+                                       substr_addr.size(),
+                                       task->task_pool,
+                                       RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
+
+                       if (addr) {
+                               rh.addr = addr;
+                               rh.real_ip.assign_copy(std::string_view(rspamd_inet_address_to_string(addr)));
+                               rh.from_ip = rh.real_ip.as_view();
+                       }
+               }
+       }
+       else {
+               if (g_ascii_isxdigit(in[0])) {
+                       /* Try to parse IP address */
+                       addr = rspamd_parse_inet_address_pool(in.data(),
+                                       in.size(), task->task_pool, RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
+                       if (addr) {
+                               rh.addr = addr;
+                               rh.real_ip.assign_copy(std::string_view(rspamd_inet_address_to_string(addr)));
+                               rh.from_ip = rh.real_ip.as_view();
+                       }
+               }
+
+               if (!addr) {
+                       /* Try canonical Postfix version: rdns [ip] */
+                       auto obrace_pos = in.find('[');
+
+                       if (obrace_pos != std::string_view::npos) {
+                               auto ebrace_pos = in.rfind(']', obrace_pos);
+
+                               if (ebrace_pos != std::string_view::npos) {
+                                       auto substr_addr = in.substr(obrace_pos + 1,
+                                                       ebrace_pos - obrace_pos - 1);
+                                       addr = rspamd_parse_inet_address_pool(substr_addr.data(),
+                                                       substr_addr.size(),
+                                                       task->task_pool,
+                                                       RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
+
+                                       if (addr) {
+                                               rh.addr = addr;
+                                               rh.real_ip.assign_copy(std::string_view(rspamd_inet_address_to_string(addr)));
+                                               rh.from_ip = rh.real_ip.as_view();
+
+                                               /* Process with rDNS */
+                                               auto rdns_substr = in.substr(0, obrace_pos);
+
+                                               if (received_process_rdns(task,
+                                                               rdns_substr,
+                                                               rh.real_hostname)) {
+                                                       ret = true;
+                                               }
+                                       }
+                               }
+                       }
+                       else {
+                               /* Hostname or some crap, sigh... */
+                               if (received_process_rdns(task, in, rh.real_hostname)) {
+                                       ret = true;
+                               }
+                       }
+               }
+       }
+
+       return ret;
+}
+
+static void
+received_process_from(struct rspamd_task *task,
+                                                                 const received_part &rpart,
+                                                                 received_header &rh)
+{
+       if (rpart.data.size() > 0) {
+               /* We have seen multiple cases:
+                * - [ip] (hostname/unknown [real_ip])
+                * - helo (hostname/unknown [real_ip])
+                * - [ip]
+                * - hostname
+                * - hostname ([ip]:port helo=xxx)
+                * Maybe more...
+                */
+               auto seen_ip_in_data = false;
+
+               if (!rpart.comments.empty()) {
+                       /* We can have info within comment as part of RFC */
+                       received_process_host_tcpinfo(
+                                       task, rh,
+                                       rpart.comments[0].as_view());
+               }
+
+               if (rh.real_ip.size() == 0) {
+                       /* Try to do the same with data */
+                       if (received_process_host_tcpinfo(
+                                       task, rh,
+                                       rpart.data.as_view())) {
+                               seen_ip_in_data = true;
+                       }
+               }
+
+               if (!seen_ip_in_data) {
+                       if (rh.real_ip.size() != 0) {
+                               /* Get anounced hostname (usually helo) */
+                               received_process_rdns(task,
+                                               rpart.data.as_view(),
+                                               rh.from_hostname);
+                       }
+                       else {
+                               received_process_host_tcpinfo(task,
+                                               rh, rpart.data.as_view());
+                       }
+               }
+       }
+       else {
+               /* rpart->dlen = 0 */
+               if (!rpart.comments.empty()) {
+                       received_process_host_tcpinfo(
+                                       task, rh,
+                                       rpart.comments[0].as_view());
+               }
+       }
+}
+
+auto
+received_header_parse(struct rspamd_task *task, const std::string_view &in,
+                                         struct rspamd_mime_header *hdr) -> bool
+{
+       std::ptrdiff_t date_pos = -1;
+
+       static constexpr const auto protos_map = frozen::make_unordered_map<frozen::string, int>({
+                       {"smtp",    RSPAMD_RECEIVED_SMTP},
+                       {"esmtp",   RSPAMD_RECEIVED_ESMTP},
+                       {"esmtpa",  RSPAMD_RECEIVED_ESMTPA | RSPAMD_RECEIVED_FLAG_AUTHENTICATED},
+                       {"esmtpsa", RSPAMD_RECEIVED_ESMTPSA | RSPAMD_RECEIVED_FLAG_SSL | RSPAMD_RECEIVED_FLAG_AUTHENTICATED},
+                       {"esmtps",  RSPAMD_RECEIVED_ESMTPS | RSPAMD_RECEIVED_FLAG_SSL},
+                       {"lmtp",    RSPAMD_RECEIVED_LMTP},
+                       {"imap",    RSPAMD_RECEIVED_IMAP},
+                       {"imaps",   RSPAMD_RECEIVED_IMAP | RSPAMD_RECEIVED_FLAG_SSL},
+                       {"http",    RSPAMD_RECEIVED_HTTP},
+                       {"https",   RSPAMD_RECEIVED_HTTP | RSPAMD_RECEIVED_FLAG_SSL},
+                       {"local",   RSPAMD_RECEIVED_LOCAL}
+       });
+
+       auto parts = received_spill(task, in, date_pos);
+
+       if (parts.empty()) {
+               return false;
+       }
+
+       auto *recv_chain_ptr = static_cast<received_header_chain *>(MESSAGE_FIELD(task, received_headers));
+
+       if (recv_chain_ptr == nullptr) {
+               /* This constructor automatically registers dtor in mempool */
+               recv_chain_ptr = new received_header_chain(task);
+               MESSAGE_FIELD(task, received_headers) = (void *)recv_chain_ptr;
+       }
+
+       auto &rh = recv_chain_ptr->new_received();
+
+       rh.flags = RSPAMD_RECEIVED_UNKNOWN;
+       rh.hdr = hdr;
+
+       for (const auto &part : parts) {
+               switch (part.type) {
+               case received_part_type::RSPAMD_RECEIVED_PART_FROM:
+                       received_process_from(task, part, rh);
+                       break;
+               case received_part_type::RSPAMD_RECEIVED_PART_BY:
+                       received_process_rdns(task,
+                                       part.data.as_view(),
+                                       rh.by_hostname);
+                       break;
+               case received_part_type::RSPAMD_RECEIVED_PART_WITH:
+                       if (part.data.size() > 0) {
+                               auto proto_flag_it = protos_map.find(part.data.as_view());
+
+                               if (proto_flag_it != protos_map.end()) {
+                                       rh.flags = proto_flag_it->second;
+                               }
+                       }
+                       break;
+               case received_part_type::RSPAMD_RECEIVED_PART_FOR:
+                       rh.for_addr = rspamd_email_address_from_smtp(part.data.data(),
+                                       part.data.size());
+
+                       if (rh.for_addr) {
+                               if (rh.for_addr->addr_len > 0) {
+                                       rh.for_mbox = std::string_view{rh.for_addr->addr,
+                                                                                                  rh.for_addr->addr_len};
+                               }
+                       }
+                       break;
+               default:
+                       /* Do nothing */
+                       break;
+               }
+       }
+
+       if (!rh.real_ip.empty() && rh.from_ip.empty()) {
+               rh.from_ip = rh.real_ip.as_view();
+       }
+
+       if (!rh.real_hostname.empty() && rh.from_hostname.empty()) {
+               rh.from_hostname.assign_copy(rh.real_hostname);
+       }
+
+       if (date_pos > 0 && date_pos < in.size()) {
+               auto date_sub = in.substr(date_pos);
+               rh.timestamp = rspamd_parse_smtp_date((const unsigned char*)date_sub.data(),
+                               date_sub.size(), nullptr);
+       }
+
+       return true;
+}
+
+} // namespace rspamd::mime
+
+bool
+rspamd_received_header_parse(struct rspamd_task *task,
+                                                        const char *data, size_t sz,
+                                                        struct rspamd_mime_header *hdr)
+{
+       return rspamd::mime::received_header_parse(task, std::string_view{data, sz}, hdr);
+}
diff --git a/src/libmime/received.h b/src/libmime/received.h
new file mode 100644 (file)
index 0000000..bc3c31e
--- /dev/null
@@ -0,0 +1,69 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#ifndef RSPAMD_RECEIVED_H
+#define RSPAMD_RECEIVED_H
+
+#include "config.h"
+#include "libutil/addr.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+enum rspamd_received_type { /* one distinct bit per value; FLAG_* bits are OR-ed onto a type bit */
+       RSPAMD_RECEIVED_SMTP = 1u << 0u, /**< protocol token "smtp" */
+       RSPAMD_RECEIVED_ESMTP = 1u << 1u, /**< protocol token "esmtp" */
+       RSPAMD_RECEIVED_ESMTPA = 1u << 2u, /**< protocol token "esmtpa" (parser also sets FLAG_AUTHENTICATED) */
+       RSPAMD_RECEIVED_ESMTPS = 1u << 3u, /**< protocol token "esmtps" (parser also sets FLAG_SSL) */
+       RSPAMD_RECEIVED_ESMTPSA = 1u << 4u, /**< protocol token "esmtpsa" (parser also sets FLAG_SSL and FLAG_AUTHENTICATED) */
+       RSPAMD_RECEIVED_LMTP = 1u << 5u, /**< protocol token "lmtp" */
+       RSPAMD_RECEIVED_IMAP = 1u << 6u, /**< protocol tokens "imap" and "imaps" (the latter with FLAG_SSL) */
+       RSPAMD_RECEIVED_LOCAL = 1u << 7u, /**< protocol token "local" */
+       RSPAMD_RECEIVED_HTTP = 1u << 8u, /**< protocol tokens "http" and "https" (the latter with FLAG_SSL) */
+       RSPAMD_RECEIVED_MAPI = 1u << 9u, /**< MAPI; no token in the parser's protocol table maps to it */
+       RSPAMD_RECEIVED_UNKNOWN = 1u << 10u, /**< initial value of the flags before any protocol is recognised */
+       RSPAMD_RECEIVED_FLAG_ARTIFICIAL = (1u << 11u), /**< NOTE(review): presumably marks a header not taken from the message - confirm */
+       RSPAMD_RECEIVED_FLAG_SSL = (1u << 12u), /**< modifier: set for "esmtps", "esmtpsa", "imaps", "https" */
+       RSPAMD_RECEIVED_FLAG_AUTHENTICATED = (1u << 13u), /**< modifier: set for "esmtpa", "esmtpsa" */
+};
+/* All protocol type bits (SMTP..UNKNOWN); the three FLAG_* modifier bits are not included */
+#define RSPAMD_RECEIVED_FLAG_TYPE_MASK (RSPAMD_RECEIVED_SMTP| \
+            RSPAMD_RECEIVED_ESMTP| \
+            RSPAMD_RECEIVED_ESMTPA| \
+            RSPAMD_RECEIVED_ESMTPS| \
+            RSPAMD_RECEIVED_ESMTPSA| \
+            RSPAMD_RECEIVED_LMTP| \
+            RSPAMD_RECEIVED_IMAP| \
+            RSPAMD_RECEIVED_LOCAL| \
+            RSPAMD_RECEIVED_HTTP| \
+            RSPAMD_RECEIVED_MAPI| \
+            RSPAMD_RECEIVED_UNKNOWN)
+
+/* Opaque types: this header only refers to them through pointers */
+struct rspamd_email_address;
+struct rspamd_received_header_chain;
+struct rspamd_mime_header;
+/*
+ * Must be declared before the prototype below: in C a struct tag that first
+ * appears inside a parameter list is scoped to that prototype only
+ */
+struct rspamd_task;
+
+/**
+ * Parses a raw Received header value and appends the result to the received
+ * headers chain of the task's message
+ * @param task task being processed
+ * @param data start of the raw header value
+ * @param sz length of the raw header value in bytes
+ * @param hdr mime header the value belongs to
+ * @return false if no parts could be extracted from the value (nothing is
+ * appended in this case), true otherwise
+ */
+bool rspamd_received_header_parse(struct rspamd_task *task,
+               const char *data, size_t sz, struct rspamd_mime_header *hdr);
+
+#ifdef  __cplusplus
+}
+#endif
+
+
+#endif //RSPAMD_RECEIVED_H
index 7eff6bf713730bebcb3e308e00d7b87f91c7039d..0d2c4044d2f54cea0ed7c119a5ed68f7ce202752 100644 (file)
 extern "C" {
 #endif
 
-int rspamd_smtp_received_parse (struct rspamd_task *task,
-                                                               const char *data, size_t len,
-                                                               struct rspamd_received_header *rh);
-
 int rspamd_smtp_addr_parse (const char *data, size_t len,
                                                        struct rspamd_email_address *addr);