From 7e6126e0169e6397d1e9e989433f590398d20fd8 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Tue, 14 Jun 2016 16:36:20 +0100
Subject: [PATCH] [Feature] Implement ragel parser for received headers

---
 src/ragel/smtp_addr_parser.rl     |   5 +
 src/ragel/smtp_address.rl         |  19 +--
 src/ragel/smtp_date.rl            |  27 ++++
 src/ragel/smtp_ip.rl              |   4 +-
 src/ragel/smtp_received.rl        |  39 +++++
 src/ragel/smtp_received_parser.rl | 250 ++++++++++++++++++++++++++++++
 src/ragel/smtp_whitespace.rl      |  30 ++++
 7 files changed, 354 insertions(+), 20 deletions(-)
 create mode 100644 src/ragel/smtp_date.rl
 create mode 100644 src/ragel/smtp_received.rl
 create mode 100644 src/ragel/smtp_received_parser.rl
 create mode 100644 src/ragel/smtp_whitespace.rl

diff --git a/src/ragel/smtp_addr_parser.rl b/src/ragel/smtp_addr_parser.rl
index a480970ec..70f9c3ca0 100644
--- a/src/ragel/smtp_addr_parser.rl
+++ b/src/ragel/smtp_addr_parser.rl
@@ -2,6 +2,11 @@
 
   machine smtp_addr_parser;
 
+  action IP6_start {}
+  action IP6_end {}
+  action IP4_start {}
+  action IP4_end {}
+
   action User_start {
     addr->user = p;
   }
diff --git a/src/ragel/smtp_address.rl b/src/ragel/smtp_address.rl
index dd148d654..fc69a0138 100644
--- a/src/ragel/smtp_address.rl
+++ b/src/ragel/smtp_address.rl
@@ -2,28 +2,11 @@
   machine smtp_address;
 
   include smtp_ip "smtp_ip.rl";
+  include smtp_whitespace "smtp_whitespace.rl";
 
   # SMTP address spec
   # Obtained from: https://tools.ietf.org/html/rfc5321#section-4.1.2
 
-  LF = "\n";
-  CR = "\r";
-  CRLF = "\r\n";
-  DQUOTE = '"';
-
-  atext = alpha | digit | "!" | "#" | "$" | "%" | "&" |
-          "'" | "*" | "+" | "_" | "/" | "=" | "?" | "^" |
-          "-" | "`" | "{" | "|" | "}" | "~";
-
-  dcontent = 33..90 | 94..126;
-  Let_dig = alpha | digit;
-  Ldh_str = ( alpha | digit | "_" | "-" )* Let_dig;
-
-  quoted_pairSMTP = "\\" 32..126;
-  qtextSMTP = 32..33 | 35..91 | 93..126;
-  Atom = atext+;
-  Dot_string = Atom ("." Atom)*;
-
   QcontentSMTP = qtextSMTP | quoted_pairSMTP %User_has_backslash;
   Quoted_string = ( DQUOTE QcontentSMTP* >User_start %User_end DQUOTE ) %Quoted_addr;
   Local_part = Dot_string >User_start %User_end | Quoted_string;
diff --git a/src/ragel/smtp_date.rl b/src/ragel/smtp_date.rl
new file mode 100644
index 000000000..d2efe61a6
--- /dev/null
+++ b/src/ragel/smtp_date.rl
@@ -0,0 +1,27 @@
+%%{
+  machine smtp_date;
+
+  include smtp_whitespace "smtp_whitespace.rl";
+
+  # SMTP date spec
+  # Obtained from: http://tools.ietf.org/html/rfc5322#section-3.3
+
+  digit_2 = digit{2};
+  digit_4 = digit{4};
+  day_name = "Mon" | "Tue" | "Wed" | "Thu" |
+             "Fri" | "Sat" | "Sun";
+  day_of_week = FWS? day_name;
+  day = FWS? digit{1,2} FWS; # RFC 5322: day is 1*2DIGIT
+  month = "Jan" | "Feb" | "Mar" | "Apr" |
+          "May" | "Jun" | "Jul" | "Aug" |
+          "Sep" | "Oct" | "Nov" | "Dec";
+  year = FWS digit{4,} FWS;
+  date = day month year;
+  hour = digit_2;
+  minute = digit_2;
+  second = digit_2;
+  time_of_day = hour ":" minute ( ":" second )?; # RFC 5322: seconds are optional
+  zone = (FWS ( "+" | "-" ) digit_4); # signed numeric offset, e.g. +0100 or -0500
+  time = time_of_day zone;
+  date_time = (day_of_week ",")? date time CFWS?;
+}%%
\ No newline at end of file
diff --git a/src/ragel/smtp_ip.rl b/src/ragel/smtp_ip.rl
index b6b0080f3..b060b750a 100644
--- a/src/ragel/smtp_ip.rl
+++ b/src/ragel/smtp_ip.rl
@@ -5,7 +5,7 @@
   # Source: https://tools.ietf.org/html/rfc5321#section-4.1.3
 
   Snum = digit{1,3};
-  IPv4_address_literal = Snum ("." Snum){3};
+  IPv4_address_literal = (Snum ("." Snum){3}) >IP4_start %IP4_end;
   IPv6_hex = xdigit{1,4};
   IPv6_full = IPv6_hex (":" IPv6_hex){7};
   IPv6_comp = (IPv6_hex (":" IPv6_hex){0,5})? "::"
@@ -15,5 +15,5 @@
                 (IPv6_hex (":" IPv6_hex){0,3} ":")?
                 IPv4_address_literal;
   IPv6_addr = IPv6_full | IPv6_comp | IPv6v4_full | IPv6v4_comp;
-  IPv6_address_literal = "IPv6:" IPv6_addr;
+  IPv6_address_literal = "IPv6:" (IPv6_addr >IP6_start %IP6_end);
 }%%
\ No newline at end of file
diff --git a/src/ragel/smtp_received.rl b/src/ragel/smtp_received.rl
new file mode 100644
index 000000000..235c54906
--- /dev/null
+++ b/src/ragel/smtp_received.rl
@@ -0,0 +1,39 @@
+%%{
+  machine smtp_received;
+
+  include smtp_whitespace "smtp_whitespace.rl";
+  include smtp_ip "smtp_ip.rl";
+  include smtp_date "smtp_date.rl";
+  include smtp_address "smtp_address.rl";
+
+  # http://tools.ietf.org/html/rfc5321#section-4.4
+
+  Addtl_Link = Atom;
+  Link = "TCP" | Addtl_Link;
+  Addtl_Protocol = Atom;
+  Protocol = "ESMTP" %ESMTP_proto | "SMTP" %SMTP_proto | "ESMTPS" %ESMTPS_proto | "LMTP" %LMTP_proto | "IMAP" %IMAP_proto | Addtl_Protocol;
+
+  TCP_info = address_literal >Real_IP_Start %Real_IP_End |
+            ( Domain >Real_Domain_Start %Real_Domain_End FWS address_literal >Real_IP_Start %Real_IP_End );
+  Extended_Domain = Domain >Real_Domain_Start %Real_Domain_End | # Used to be a real domain
+                   ( Domain >Reported_Domain_Start %Reported_Domain_End FWS "(" TCP_info ")" ) | # Here domain is something specified by remote side
+                   ( address_literal >Real_Domain_Start %Real_Domain_End FWS "(" TCP_info ")" );
+
+  From_domain = "FROM"i FWS Extended_Domain >From_Start %From_End;
+  By_domain = CFWS "BY"i FWS Extended_Domain >By_Start %By_End;
+
+  Via = CFWS "VIA"i FWS Link;
+  With = CFWS "WITH"i FWS Protocol;
+
+  id_left = dot_atom_text;
+  no_fold_literal = "[" dtext* "]";
+  id_right = dot_atom_text | no_fold_literal;
+  msg_id = "<" id_left "@" id_right ">";
+  ID = CFWS "ID"i FWS ( Atom | msg_id );
+
+  For = CFWS "FOR"i FWS ( Path | Mailbox ) %For_End;
+  Additional_Registered_Clauses = CFWS Atom FWS String;
+  Opt_info = Via? With? ID? For? Additional_Registered_Clauses?;
+  Received = From_domain By_domain Opt_info CFWS? ";" FWS date_time;
+
+}%%
diff --git a/src/ragel/smtp_received_parser.rl b/src/ragel/smtp_received_parser.rl
new file mode 100644
index 000000000..51cb90720
--- /dev/null
+++ b/src/ragel/smtp_received_parser.rl
@@ -0,0 +1,250 @@
+%%{
+
+  machine smtp_received_parser;
+
+
+  # Bounds of the most recent IPv4/IPv6 literal matched by smtp_ip.rl
+  action IP6_start {
+    ip_start = p;
+  }
+  action IP6_end {
+    ip_end = p;
+  }
+  action IP4_start {
+    ip_start = p;
+  }
+  action IP4_end {
+    ip_end = p;
+  }
+
+  # Address actions embedded by smtp_address.rl; they fill in *addr
+  action User_start {
+    addr->user = p;
+  }
+
+  action User_end {
+    if (addr->user) {
+      addr->user_len = p - addr->user;
+    }
+  }
+
+  action Domain_start {
+    addr->domain = p;
+  }
+
+  action Domain_end {
+    if (addr->domain) {
+      addr->domain_len = p - addr->domain;
+    }
+  }
+
+  action Domain_addr_start {
+    addr->domain = p;
+    addr->flags |= RSPAMD_EMAIL_ADDR_IP;
+  }
+
+  action Domain_addr_end {
+    if (addr->domain) {
+      addr->domain_len = p - addr->domain;
+    }
+  }
+
+  action User_has_backslash {
+    addr->flags |= RSPAMD_EMAIL_ADDR_HAS_BACKSLASH;
+  }
+
+  action Quoted_addr {
+    addr->flags |= RSPAMD_EMAIL_ADDR_QUOTED;
+  }
+
+  action Empty_addr {
+    addr->flags |= RSPAMD_EMAIL_ADDR_EMPTY;
+    addr->addr = "";
+    addr->user = addr->addr;
+    addr->domain = addr->addr;
+  }
+
+  action Valid_addr {
+    addr->flags |= RSPAMD_EMAIL_ADDR_VALID;
+  }
+
+  action Addr_has_angle {
+    addr->flags |= RSPAMD_EMAIL_ADDR_BRACED;
+  }
+
+  action Addr_start {
+    addr->addr = p;
+  }
+
+  action Addr_end {
+    if (addr->addr) {
+      addr->addr_len = p - addr->addr;
+    }
+  }
+
+  # Host name spans captured inside a FROM/BY clause
+  action Real_Domain_Start {
+    real_domain_start = p;
+  }
+  action Real_Domain_End {
+    real_domain_end = p;
+  }
+  action Reported_Domain_Start {
+    reported_domain_start = p;
+  }
+  action Reported_Domain_End {
+    reported_domain_end = p;
+  }
+
+  # Address literal spans; kept apart from the host name spans above
+  action Real_IP_Start {
+    real_ip_start = p;
+  }
+  action Real_IP_End {
+    real_ip_end = p;
+  }
+  action Reported_IP_Start {
+    reported_ip_start = p;
+  }
+  action Reported_IP_End {
+    reported_ip_end = p;
+  }
+
+  # Drop spans left over from an earlier clause
+  action From_Start {
+    real_domain_start = NULL;
+    real_domain_end = NULL;
+    real_ip_start = NULL;
+    real_ip_end = NULL;
+    reported_domain_start = NULL;
+    reported_domain_end = NULL;
+    reported_ip_start = NULL;
+    reported_ip_end = NULL;
+    ip_start = NULL;
+    ip_end = NULL;
+  }
+
+  action By_Start {
+    real_domain_start = NULL;
+    real_domain_end = NULL;
+    real_ip_start = NULL;
+    real_ip_end = NULL;
+    reported_domain_start = NULL;
+    reported_domain_end = NULL;
+    reported_ip_start = NULL;
+    reported_ip_end = NULL;
+    ip_start = NULL;
+    ip_end = NULL;
+  }
+
+  action By_End {
+    /* Do nothing here for now */
+  }
+
+  action From_End {
+    guint len;
+
+    /* Copy each captured span into task pool memory (len + 1 bytes) */
+    if (real_domain_end && real_domain_start && real_domain_end > real_domain_start) {
+      len = real_domain_end - real_domain_start;
+      rh->real_hostname = rspamd_mempool_alloc (task->task_pool, len + 1);
+      rspamd_strlcpy (rh->real_hostname, real_domain_start, len + 1);
+    }
+    if (reported_domain_end && reported_domain_start && reported_domain_end > reported_domain_start) {
+      len = reported_domain_end - reported_domain_start;
+      rh->from_hostname = rspamd_mempool_alloc (task->task_pool, len + 1);
+      rspamd_strlcpy (rh->from_hostname, reported_domain_start, len + 1);
+    }
+    if (real_ip_end && real_ip_start && real_ip_end > real_ip_start) {
+      len = real_ip_end - real_ip_start;
+      rh->real_ip = rspamd_mempool_alloc (task->task_pool, len + 1);
+      rspamd_strlcpy (rh->real_ip, real_ip_start, len + 1);
+    }
+    if (reported_ip_end && reported_ip_start && reported_ip_end > reported_ip_start) {
+      len = reported_ip_end - reported_ip_start;
+      rh->from_ip = rspamd_mempool_alloc (task->task_pool, len + 1);
+      rspamd_strlcpy (rh->from_ip, reported_ip_start, len + 1);
+    }
+
+    if (rh->real_ip && !rh->from_ip) {
+      rh->from_ip = rh->real_ip;
+    }
+    if (rh->real_hostname && !rh->from_hostname) {
+      rh->from_hostname = rh->real_hostname;
+    }
+
+    if (rh->real_ip && ip_start && ip_end && ip_end > ip_start) {
+      if (rspamd_parse_inet_address (&rh->addr, ip_start, ip_end - ip_start)) { /* NOTE(review): third argument assumed to be a length - confirm against addr.h */
+        rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t)rspamd_inet_address_destroy, rh->addr);
+      }
+    }
+  }
+
+  action For_End {
+
+  }
+
+  action SMTP_proto {
+    rh->type = RSPAMD_RECEIVED_SMTP;
+  }
+  action ESMTPS_proto {
+    rh->type = RSPAMD_RECEIVED_ESMTPS;
+  }
+  action ESMTP_proto {
+    rh->type = RSPAMD_RECEIVED_ESMTP;
+  }
+  action LMTP_proto {
+    rh->type = RSPAMD_RECEIVED_LMTP;
+  }
+  action IMAP_proto {
+    rh->type = RSPAMD_RECEIVED_IMAP;
+  }
+
+  include smtp_received "smtp_received.rl";
+
+  main := Received;
+
+}%%
+
+%% write data;
+
+/*
+ * Runs the Received machine over the len bytes at data and fills *rh.
+ * *rh is zeroed first and its type is preset to RSPAMD_RECEIVED_UNKNOWN.
+ * Strings stored in *rh are allocated from task->task_pool.
+ * Returns the machine state number (cs) left by the generated code.
+ */
+static int
+rspamd_smtp_recieved_parse (struct rspamd_task *task, const char *data, size_t len, struct received_header *rh)
+{
+  struct rspamd_email_address for_addr, *addr;
+  const gchar *real_domain_start, *real_domain_end,
+      *real_ip_start, *real_ip_end,
+      *reported_domain_start, *reported_domain_end,
+      *reported_ip_start, *reported_ip_end,
+      *ip_start, *ip_end;
+  /* Variables that the actions and the generated init/exec code refer to */
+  const char *p = data, *pe = data + len;
+  int cs;
+
+  memset (rh, 0, sizeof (*rh));
+  real_domain_start = NULL;
+  real_domain_end = NULL;
+  real_ip_start = NULL;
+  real_ip_end = NULL;
+  reported_domain_start = NULL;
+  reported_domain_end = NULL;
+  reported_ip_start = NULL;
+  reported_ip_end = NULL;
+  ip_start = NULL;
+  ip_end = NULL;
+  rh->type = RSPAMD_RECEIVED_UNKNOWN;
+
+  memset (&for_addr, 0, sizeof (for_addr));
+  addr = &for_addr;
+
+  %% write init;
+  %% write exec;
+
+  return cs;
+}
\ No newline at end of file
diff --git a/src/ragel/smtp_whitespace.rl b/src/ragel/smtp_whitespace.rl
new file mode 100644
index 000000000..5bac17a4e
--- /dev/null
+++ b/src/ragel/smtp_whitespace.rl
@@ -0,0 +1,30 @@
+%%{
+  machine smtp_whitespace;
+
+  WSP = " " | "\t"; # RFC 5234: WSP = SP / HTAB
+  CRLF = "\r\n" | ("\r" [^\n]) | ([^\r] "\n");
+  DQUOTE = '"';
+
+  # Printable US-ASCII characters not including specials
+  atext = alpha | digit | "!" | "#" | "$" | "%" | "&" |
+          "'" | "*" | "+" | "_" | "/" | "=" | "?" | "^" |
+          "-" | "`" | "{" | "|" | "}" | "~";
+  # Printable US-ASCII characters not including "[", "]", or "\"
+  dtext = 33..90 | 94..126;
+  # Printable US-ASCII characters not including "(", ")", or "\"
+  ctext = 33..39 | 42..91 | 93..126;
+
+  dcontent = 33..90 | 94..126;
+  Let_dig = alpha | digit;
+  Ldh_str = ( alpha | digit | "_" | "-" )* Let_dig;
+
+  quoted_pairSMTP = "\\" 32..126;
+  qtextSMTP = 32..33 | 35..91 | 93..126;
+  Atom = atext+;
+  Dot_string = Atom ("." Atom)*;
+  dot_atom_text = atext+ ("." atext+)*;
+  FWS = ((WSP* CRLF)? WSP+);
+
+  comment = "(" (FWS? ctext)* FWS? ")";
+  CFWS = ((FWS? comment)+ FWS?) | FWS;
+}%%
\ No newline at end of file
-- 
2.39.5