]> source.dussan.org Git - rspamd.git/commitdiff
[Feature] Implement ragel parser for received headers
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 14 Jun 2016 15:36:20 +0000 (16:36 +0100)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 14 Jun 2016 15:36:20 +0000 (16:36 +0100)
src/ragel/smtp_addr_parser.rl
src/ragel/smtp_address.rl
src/ragel/smtp_date.rl [new file with mode: 0644]
src/ragel/smtp_ip.rl
src/ragel/smtp_received.rl [new file with mode: 0644]
src/ragel/smtp_received_parser.rl [new file with mode: 0644]
src/ragel/smtp_whitespace.rl [new file with mode: 0644]

index a480970ec3149d266a10f96054dcf6044fef1040..70f9c3ca0d73d7e44f8f4a56e84c9b73ed96c878 100644 (file)
@@ -2,6 +2,11 @@
 
   machine smtp_addr_parser;
 
+  action IP6_start {}
+  action IP6_end {}
+  action IP4_start {}
+  action IP4_end {}
+
   action User_start {
     addr->user = p;
   }
index dd148d6543c64e8f422ca8f023667af18dab4cab..fc69a01389ec5ae8d4906815dbd701435ea23fce 100644 (file)
@@ -2,28 +2,11 @@
   machine smtp_address;
 
   include smtp_ip "smtp_ip.rl";
+  include smtp_whitespace "smtp_whitespace.rl";
 
   # SMTP address spec
   # Obtained from: https://tools.ietf.org/html/rfc5321#section-4.1.2
 
-  LF = "\n";
-  CR = "\r";
-  CRLF = "\r\n";
-  DQUOTE = '"';
-
-  atext = alpha | digit | "!" | "#" | "$" | "%" | "&" |
-          "'" | "*" | "+" | "_" | "/" | "=" | "?" | "^" |
-          "-" | "`" | "{" | "|" | "}" | "~";
-
-  dcontent       = 33..90 | 94..126;
-  Let_dig        = alpha | digit;
-  Ldh_str        = ( alpha | digit | "_" | "-" )* Let_dig;
-
-  quoted_pairSMTP  = "\\" 32..126;
-  qtextSMTP      = 32..33 | 35..91 | 93..126;
-  Atom           = atext+;
-  Dot_string     = Atom ("."  Atom)*;
-
   QcontentSMTP   = qtextSMTP | quoted_pairSMTP %User_has_backslash;
   Quoted_string  = ( DQUOTE QcontentSMTP* >User_start %User_end DQUOTE ) %Quoted_addr;
   Local_part     = Dot_string >User_start %User_end | Quoted_string;
diff --git a/src/ragel/smtp_date.rl b/src/ragel/smtp_date.rl
new file mode 100644 (file)
index 0000000..d2efe61
--- /dev/null
@@ -0,0 +1,27 @@
+%%{
+  machine smtp_date;
+
+  include smtp_whitespace "smtp_whitespace.rl";
+
+  # SMTP date spec
+  # Obtained from: http://tools.ietf.org/html/rfc5322#section-3.3
+  #
+  # Grammar for the date-time that terminates a Received header, e.g.
+  # "Tue, 14 Jun 2016 15:36:20 +0100". FWS and CFWS are taken from
+  # smtp_whitespace.rl. The entry rule is date_time.
+
+  digit_2         =   digit{2};
+  digit_4         =   digit{4};
+  day_name        =    "Mon" | "Tue" | "Wed" | "Thu" |
+                       "Fri" | "Sat" | "Sun";
+  day_of_week     =   FWS? day_name;
+  # RFC 5322: day = ([FWS] 1*2DIGIT FWS), so one-digit days must match too
+  day             =   FWS? digit{1,2} FWS;
+  month           =    "Jan" | "Feb" | "Mar" | "Apr" |
+                       "May" | "Jun" | "Jul" | "Aug" |
+                       "Sep" | "Oct" | "Nov" | "Dec";
+  year            =   FWS digit{4,} FWS;
+  date            =   day month year;
+  hour            =   digit_2;
+  minute          =   digit_2;
+  second          =   digit_2;
+  # RFC 5322: time-of-day = hour ":" minute [ ":" second ] (seconds optional)
+  time_of_day     =   hour ":" minute ( ":" second )?;
+  # RFC 5322: zone = (FWS ( "+" / "-" ) 4DIGIT)
+  zone            =   (FWS ( "+" | "-" ) digit_4);
+  time            =   time_of_day zone;
+  date_time       =   (day_of_week ",")? date time CFWS?;
+}%%
\ No newline at end of file
index b6b0080f36011f8cf4530282105f1dffee3174ea..b060b750afe783ed2e09e0640e0cb26e18ff8b9f 100644 (file)
@@ -5,7 +5,7 @@
   # Source: https://tools.ietf.org/html/rfc5321#section-4.1.3
 
   Snum           = digit{1,3};
-  IPv4_address_literal  = Snum ("."  Snum){3};
+  IPv4_address_literal  = (Snum ("."  Snum){3}) >IP4_start %IP4_end;
   IPv6_hex       = xdigit{1,4};
   IPv6_full      = IPv6_hex (":" IPv6_hex){7};
   IPv6_comp      = (IPv6_hex (":" IPv6_hex){0,5})? "::"
@@ -15,5 +15,5 @@
                   (IPv6_hex (":" IPv6_hex){0,3} ":")?
                   IPv4_address_literal;
   IPv6_addr      = IPv6_full | IPv6_comp | IPv6v4_full | IPv6v4_comp;
-  IPv6_address_literal  = "IPv6:" IPv6_addr;
+  IPv6_address_literal  = "IPv6:" (IPv6_addr >IP6_start %IP6_end);
 }%%
\ No newline at end of file
diff --git a/src/ragel/smtp_received.rl b/src/ragel/smtp_received.rl
new file mode 100644 (file)
index 0000000..235c549
--- /dev/null
@@ -0,0 +1,39 @@
+%%{
+  machine smtp_received;
+
+  include smtp_whitespace "smtp_whitespace.rl";
+  include smtp_ip "smtp_ip.rl";
+  include smtp_date "smtp_date.rl";
+  include smtp_address"smtp_address.rl";
+
+  # http://tools.ietf.org/html/rfc5321#section-4.4
+  #
+  # Grammar of a Received header value: a FROM clause, a BY clause, optional
+  # VIA / WITH / ID / FOR clauses, then ";" and a date-time. The rule to
+  # instantiate is Received.
+  #
+  # This file defines no actions. Every action referenced below (the *_proto,
+  # Real_* / Reported_*, From_*, By_* and For_End actions) has to be defined
+  # by the machine that includes this file, before the include statement.
+
+  Addtl_Link     = Atom;
+  Link           = "TCP" | Addtl_Link;
+  Attdl_Protocol = Atom;
+  # Known protocol keywords fire a dedicated action; any other Atom is accepted silently
+  Protocol       = "ESMTP" %ESMTP_proto | "SMTP" %SMTP_proto | "ESMTPS" %ESMTPS_proto | "LMTP" %LMTP_proto | "IMAP" %IMAP_proto | Attdl_Protocol;
+
+  # Content of the parenthesised part: an address literal, optionally preceded by a domain
+  TCP_info       = address_literal >Real_IP_Start %Real_IP_End |
+                  ( Domain >Real_Domain_Start %Real_Domain_End FWS address_literal >Real_IP_Start %Real_IP_End );
+  Extended_Domain  = Domain >Real_Domain_Start %Real_Domain_End | # Used to be a real domain
+                  ( Domain >Reported_Domain_Start %Reported_Domain_End FWS "(" TCP_info ")" ) | # Here domain is something specified by remote side
+                  ( address_literal >Real_Domain_Start %Real_Domain_End FWS "(" TCP_info ")" );
+
+  From_domain    = "FROM"i FWS Extended_Domain >From_Start %From_End;
+  By_domain      = CFWS "BY"i FWS Extended_Domain >By_Start %By_End;
+
+  Via            = CFWS "VIA"i FWS Link;
+  With           = CFWS "WITH"i FWS Protocol;
+
+  id_left        = dot_atom_text;
+  no_fold_literal = "[" dtext* "]";
+  id_right       = dot_atom_text | no_fold_literal;
+  msg_id         = "<" id_left "@" id_right ">";
+  ID             = CFWS "ID"i FWS ( Atom | msg_id );
+
+  For            = CFWS "FOR"i FWS ( Path | Mailbox ) %For_End;
+  Additional_Registered_Clauses  = CFWS Atom FWS String;
+  # Each optional clause may appear at most once, and only in this order
+  Opt_info       = Via? With? ID? For? Additional_Registered_Clauses?;
+  Received       = From_domain By_domain Opt_info CFWS? ";" FWS date_time;
+
+}%%
diff --git a/src/ragel/smtp_received_parser.rl b/src/ragel/smtp_received_parser.rl
new file mode 100644 (file)
index 0000000..51cb907
--- /dev/null
@@ -0,0 +1,235 @@
+%%{
+
+  machine smtp_received_parser;
+
+
+  # Boundaries of the IP literal matched most recently. smtp_ip.rl attaches
+  # these as entering (>) and leaving (%) actions of its IPv4/IPv6 literal
+  # rules; both address families share the same ip_start / ip_end pair.
+  action IP6_start {
+    ip_start = p;
+  }
+  action IP6_end {
+    ip_end = p;
+  }
+  action IP4_start {
+    ip_start = p;
+  }
+  action IP4_end {
+    ip_end = p;
+  }
+
+  # Actions for the e-mail address grammar. They store pointers into the
+  # parsed buffer (not copies) and flag bits in the structure that `addr`
+  # points to. Lengths are computed from the current position `p` when the
+  # matching *_end action runs.
+
+  # Start of the local part
+  action User_start {
+    addr->user = p;
+  }
+
+  # End of the local part; the length is only set if a start was recorded
+  action User_end {
+    if (addr->user) {
+      addr->user_len = p - addr->user;
+    }
+  }
+
+  # Start of a domain
+  action Domain_start {
+    addr->domain = p;
+  }
+
+  # End of a domain; the length is only set if a start was recorded
+  action Domain_end {
+    if (addr->domain) {
+      addr->domain_len = p - addr->domain;
+    }
+  }
+
+  # Start of a domain part that is flagged as an IP address
+  action Domain_addr_start {
+    addr->domain = p;
+    addr->flags |= RSPAMD_EMAIL_ADDR_IP;
+  }
+
+  # End of such a domain part
+  action Domain_addr_end {
+    if (addr->domain) {
+      addr->domain_len = p - addr->domain;
+    }
+  }
+
+  # Fired by smtp_address.rl after a backslash-quoted pair inside a quoted string
+  action User_has_backslash {
+    addr->flags |= RSPAMD_EMAIL_ADDR_HAS_BACKSLASH;
+  }
+
+  # Fired by smtp_address.rl when the local part is a quoted string
+  action Quoted_addr {
+    addr->flags |= RSPAMD_EMAIL_ADDR_QUOTED;
+  }
+
+  # Marks the address as empty; addr, user and domain all point at the same
+  # static empty string
+  action Empty_addr {
+    addr->flags |= RSPAMD_EMAIL_ADDR_EMPTY;
+    addr->addr = "";
+    addr->user = addr->addr;
+    addr->domain = addr->addr;
+  }
+
+  action Valid_addr {
+    addr->flags |= RSPAMD_EMAIL_ADDR_VALID;
+  }
+
+  action Addr_has_angle {
+    addr->flags |= RSPAMD_EMAIL_ADDR_BRACED;
+  }
+
+  # Start of the whole address
+  action Addr_start {
+    addr->addr = p;
+  }
+
+  # End of the whole address; the length is only set if a start was recorded
+  action Addr_end {
+    if (addr->addr) {
+      addr->addr_len = p - addr->addr;
+    }
+  }
+
+  # Boundaries of the domains seen in a FROM/BY clause. smtp_received.rl uses
+  # the Reported_* pair for a domain that is followed by a parenthesised
+  # TCP-info part, and the Real_* pair in the remaining positions.
+  action Real_Domain_Start {
+    real_domain_start = p;
+  }
+  action Real_Domain_End {
+    real_domain_end = p;
+  }
+  action Reported_Domain_Start {
+    reported_domain_start = p;
+  }
+  action Reported_Domain_End {
+    reported_domain_end = p;
+  }
+
+  # Boundaries of the IP literals seen in a FROM/BY clause. These must write
+  # to the *_ip_* pointers: From_End reads real_ip_* / reported_ip_*, and
+  # writing to the *_domain_* pointers instead would overwrite the domain
+  # captured just before the address literal.
+  action Real_IP_Start {
+    real_ip_start = p;
+  }
+  action Real_IP_End {
+    real_ip_end = p;
+  }
+  action Reported_IP_Start {
+    reported_ip_start = p;
+  }
+  action Reported_IP_End {
+    reported_ip_end = p;
+  }
+
+  # Entering action of the FROM clause's Extended_Domain: forget every
+  # boundary captured so far.
+  action From_Start {
+    real_domain_start = NULL;
+    real_domain_end = NULL;
+    real_ip_start = NULL;
+    real_ip_end = NULL;
+    reported_domain_start = NULL;
+    reported_domain_end = NULL;
+    reported_ip_start = NULL;
+    reported_ip_end = NULL;
+    ip_start = NULL;
+    ip_end = NULL;
+  }
+
+  # Entering action of the BY clause's Extended_Domain: performs the same
+  # reset, so values captured while parsing the FROM clause are cleared here.
+  action By_Start {
+    real_domain_start = NULL;
+    real_domain_end = NULL;
+    real_ip_start = NULL;
+    real_ip_end = NULL;
+    reported_domain_start = NULL;
+    reported_domain_end = NULL;
+    reported_ip_start = NULL;
+    reported_ip_end = NULL;
+    ip_start = NULL;
+    ip_end = NULL;
+  }
+
+  # Leaving action of the BY clause's Extended_Domain; the captured values
+  # are currently not stored anywhere.
+  action By_End {
+    /* Do nothing here for now */
+  }
+
+  # Leaving action of the FROM clause's Extended_Domain: copies the captured
+  # ranges into `rh` and parses the IP literal into rh->addr.
+  action From_End {
+    guint len;
+
+    /*
+     * Each non-empty captured range is copied into a NUL-terminated string
+     * allocated from the task memory pool.
+     */
+    if (real_domain_end && real_domain_start && real_domain_end > real_domain_start) {
+      len = real_domain_end - real_domain_start;
+      rh->real_hostname = rspamd_mempool_alloc (task->task_pool, len + 1);
+      rspamd_strlcpy (rh->real_hostname, real_domain_start, len + 1);
+    }
+    if (reported_domain_end && reported_domain_start && reported_domain_end > reported_domain_start) {
+      len = reported_domain_end - reported_domain_start;
+      rh->from_hostname = rspamd_mempool_alloc (task->task_pool, len + 1);
+      rspamd_strlcpy (rh->from_hostname, reported_domain_start, len + 1);
+    }
+    if (real_ip_end && real_ip_start && real_ip_end > real_ip_start) {
+      len = real_ip_end - real_ip_start;
+      rh->real_ip = rspamd_mempool_alloc (task->task_pool, len + 1);
+      rspamd_strlcpy (rh->real_ip, real_ip_start, len + 1);
+    }
+    if (reported_ip_end && reported_ip_start && reported_ip_end > reported_ip_start) {
+      len = reported_ip_end - reported_ip_start;
+      rh->from_ip = rspamd_mempool_alloc (task->task_pool, len + 1);
+      rspamd_strlcpy (rh->from_ip, reported_ip_start, len + 1);
+    }
+
+    /* Without a separately reported value, fall back to the real one */
+    if (rh->real_ip && !rh->from_ip) {
+      rh->from_ip = rh->real_ip;
+    }
+    if (rh->real_hostname && !rh->from_hostname) {
+      rh->from_hostname = rh->real_hostname;
+    }
+
+    if (rh->real_ip && ip_start && ip_end && ip_end > ip_start) {
+      /* The third argument is the length of the literal, not its end pointer */
+      if (rspamd_parse_inet_address (&rh->addr, ip_start, ip_end - ip_start)) {
+        rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t)rspamd_inet_address_destroy, rh->addr);
+      }
+    }
+  }
+
+  # Leaving action of the FOR clause's path/mailbox (see smtp_received.rl);
+  # intentionally empty at the moment.
+  action For_End {
+
+  }
+
+  # Leaving actions of the protocol keywords in the WITH clause: each one
+  # records the matched protocol in rh->type.
+  action SMTP_proto {
+    rh->type = RSPAMD_RECEIVED_SMTP;
+  }
+  action ESMTPS_proto {
+    rh->type = RSPAMD_RECEIVED_ESMTPS;
+  }
+  action ESMTP_proto {
+    rh->type = RSPAMD_RECEIVED_ESMTP;
+  }
+  action LMTP_proto {
+    rh->type = RSPAMD_RECEIVED_LMTP;
+  }
+  action IMAP_proto {
+    rh->type = RSPAMD_RECEIVED_IMAP;
+  }
+
+  # The grammar is included after the action definitions because it refers
+  # to the actions by name.
+  include smtp_received "smtp_received.rl";
+
+  main := Received;
+
+}%%
+
+%% write data;
+
+/*
+ * Runs the smtp_received_parser machine over one Received header value.
+ *
+ * task: task whose memory pool receives the strings copied by From_End
+ * data: start of the header value
+ * len:  number of bytes available at data
+ * rh:   output structure; it is zeroed first and its type is set to
+ *       RSPAMD_RECEIVED_UNKNOWN until a protocol action overrides it
+ *
+ * Returns the state `cs` the machine stopped in. Interpreting it requires
+ * the state constants emitted by "write data" (NOTE(review): confirm
+ * against the generated code and the callers).
+ */
+static int
+rspamd_smtp_recieved_parse (struct rspamd_task *task, const char *data, size_t len, struct received_header *rh)
+{
+  struct rspamd_email_address for_addr, *addr;
+  const gchar *real_domain_start, *real_domain_end,
+              *real_ip_start, *real_ip_end,
+              *reported_domain_start, *reported_domain_end,
+              *reported_ip_start, *reported_ip_end,
+              *ip_start, *ip_end;
+  /*
+   * Variables required by the Ragel-generated code and by the actions:
+   * current position, end of buffer, end of input and current state.
+   * The whole header is available at once, hence eof == pe.
+   */
+  const char *p = data, *pe = data + len, *eof = pe;
+  int cs = 0;
+
+  memset (rh, 0, sizeof (*rh));
+  real_domain_start = NULL;
+  real_domain_end = NULL;
+  real_ip_start = NULL;
+  real_ip_end = NULL;
+  reported_domain_start = NULL;
+  reported_domain_end = NULL;
+  reported_ip_start = NULL;
+  reported_ip_end = NULL;
+  ip_start = NULL;
+  ip_end = NULL;
+  rh->type = RSPAMD_RECEIVED_UNKNOWN;
+
+  /* The address actions write through `addr`, so point it at a scratch object */
+  memset (&for_addr, 0, sizeof (for_addr));
+  addr = &for_addr;
+
+  %% write init;
+  %% write exec;
+
+  return cs;
+}
\ No newline at end of file
diff --git a/src/ragel/smtp_whitespace.rl b/src/ragel/smtp_whitespace.rl
new file mode 100644 (file)
index 0000000..5bac17a
--- /dev/null
@@ -0,0 +1,30 @@
+%%{
+  machine smtp_whitespace;
+
+  # Shared lexical rules (whitespace, atoms, comments) included by the other
+  # smtp_*.rl grammars.
+
+  # RFC 5234: WSP = SP / HTAB. Folded header lines frequently continue with
+  # a tab, so HTAB has to be accepted as well.
+  WSP             =   " " | "\t";
+  # Besides the canonical CRLF this also accepts a CR followed by any byte
+  # other than LF, and any byte other than CR followed by LF. Both lenient
+  # alternatives consume the neighbouring byte too.
+  CRLF            =   "\r\n" | ("\r" [^\n]) | ([^\r] "\n");
+  DQUOTE = '"';
+
+  # Printable US-ASCII characters not including specials
+  atext = alpha | digit | "!" | "#" | "$" | "%" | "&" |
+          "'" | "*" | "+" | "_" | "/" | "=" | "?" | "^" |
+          "-" | "`" | "{" | "|" | "}" | "~";
+  # Printable US-ASCII characters not including "[", "]", or "\"
+  dtext = 33..90 | 94..126;
+  # Printable US-ASCII characters not including  "(", ")", or "\"
+  ctext = 33..39 | 42..91 | 93..126;
+
+  # Same character set as dtext
+  dcontent       = 33..90 | 94..126;
+  Let_dig        = alpha | digit;
+  Ldh_str        = ( alpha | digit | "_" | "-" )* Let_dig;
+
+  quoted_pairSMTP  = "\\" 32..126;
+  qtextSMTP      = 32..33 | 35..91 | 93..126;
+  Atom           = atext+;
+  Dot_string     = Atom ("."  Atom)*;
+  dot_atom_text  = atext+ ("." atext+)*;
+  # Folding whitespace: an optional line break followed by at least one WSP
+  FWS            =   ((WSP* CRLF)? WSP+);
+
+  # Comments are not nested here: ctext excludes "(" and ")"
+  comment        =   "(" (FWS? ctext)* FWS? ")";
+  CFWS           =   ((FWS? comment)+ FWS?) | FWS;
+}%%
\ No newline at end of file