]> source.dussan.org Git - rspamd.git/commitdiff
[Rework] Implement content type parser for mime
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 12 Dec 2016 13:44:08 +0000 (13:44 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 12 Dec 2016 13:44:08 +0000 (13:44 +0000)
src/CMakeLists.txt
src/libmime/CMakeLists.txt
src/libmime/content_type.c [new file with mode: 0644]
src/libmime/content_type.h [new file with mode: 0644]
src/libmime/smtp_parsers.h
src/ragel/content_type.rl [new file with mode: 0644]
src/ragel/content_type_parser.rl [new file with mode: 0644]

index 943a4dc9e615635afc9539e283b2f118a81dfbe4..02cf7e7f10e5a8359c5ae29e11c5203477dac7d8 100644 (file)
@@ -105,7 +105,8 @@ SET(RAGEL_DEPENDS "${CMAKE_SOURCE_DIR}/src/ragel/smtp_address.rl"
        "${CMAKE_SOURCE_DIR}/src/ragel/smtp_date.rl"
        "${CMAKE_SOURCE_DIR}/src/ragel/smtp_ip.rl"
        "${CMAKE_SOURCE_DIR}/src/ragel/smtp_whitespace.rl"
-       "${CMAKE_SOURCE_DIR}/src/ragel/smtp_received.rl")
+       "${CMAKE_SOURCE_DIR}/src/ragel/smtp_received.rl"
+       "${CMAKE_SOURCE_DIR}/src/ragel/content_type.rl")
 RAGEL_TARGET(ragel_smtp_addr
        INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/smtp_addr_parser.rl
        DEPENDS ${RAGEL_DEPENDS}
@@ -121,6 +122,11 @@ RAGEL_TARGET(ragel_newlines_strip
        DEPENDS ${RAGEL_DEPENDS}
        COMPILE_FLAGS -G2
        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/newlines_strip.rl.c)
+RAGEL_TARGET(ragel_content_type # Ragel target for the content type parser
+       INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/content_type_parser.rl
+       DEPENDS ${RAGEL_DEPENDS}
+       COMPILE_FLAGS -G2
+       OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/content_type.rl.c) # Output is listed in the rspamd-server sources
 ######################### LINK SECTION ###############################
 
 ADD_LIBRARY(rspamd-server STATIC
@@ -134,7 +140,8 @@ ADD_LIBRARY(rspamd-server STATIC
                ${PLUGINSSRC}
                "${RAGEL_ragel_smtp_addr_OUTPUTS}"
                "${RAGEL_ragel_smtp_received_OUTPUTS}"
-               "${RAGEL_ragel_newlines_strip_OUTPUTS}")
+               "${RAGEL_ragel_newlines_strip_OUTPUTS}"
+               "${RAGEL_ragel_content_type_OUTPUTS}")
 TARGET_LINK_LIBRARIES(rspamd-server rspamd-http-parser)
 TARGET_LINK_LIBRARIES(rspamd-server rspamd-cdb)
 TARGET_LINK_LIBRARIES(rspamd-server rspamd-lpeg)
index a4485461a998ac3ae70eedb4f2cf4296dd65c330..39bd2d4026c8c6e40e96bbf7643827cf93aac287 100644 (file)
@@ -5,6 +5,7 @@ SET(LIBRSPAMDMIMESRC
                                ${CMAKE_CURRENT_SOURCE_DIR}/filter.c
                                ${CMAKE_CURRENT_SOURCE_DIR}/images.c
                                ${CMAKE_CURRENT_SOURCE_DIR}/message.c
-                               ${CMAKE_CURRENT_SOURCE_DIR}/archives.c)
+                               ${CMAKE_CURRENT_SOURCE_DIR}/archives.c
+                               ${CMAKE_CURRENT_SOURCE_DIR}/content_type.c)
 
 SET(RSPAMD_MIME ${LIBRSPAMDMIMESRC} PARENT_SCOPE)
\ No newline at end of file
diff --git a/src/libmime/content_type.c b/src/libmime/content_type.c
new file mode 100644 (file)
index 0000000..9161850
--- /dev/null
@@ -0,0 +1,79 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "libmime/content_type.h"
+#include "smtp_parsers.h"
+#include "utlist.h"
+
+/*
+ * Records one `name=value` parameter of a content type in `ct`.
+ *
+ * The name and value are stored as tokens that point into the caller's
+ * buffer ([name_start, name_end) and [value_start, value_end)); nothing is
+ * copied, so that buffer has to outlive `ct`. The parameter node itself is
+ * allocated from `pool`.
+ *
+ * Parameters are indexed by name in `ct->attrs`, which is created on first
+ * use with case-insensitive hashing and comparison. The first parameter seen
+ * for a name becomes the hash table value; later parameters with the same
+ * name are chained after it through the prev/next links.
+ */
+void
+rspamd_content_type_add_param (rspamd_mempool_t *pool,
+               struct rspamd_content_type *ct,
+               const gchar *name_start, const gchar *name_end,
+               const gchar *value_start, const gchar *value_end)
+{
+       rspamd_ftok_t srch;
+       struct rspamd_content_type_param *found = NULL, *nparam;
+       gboolean first_with_name;
+
+       g_assert (ct != NULL);
+
+       srch.begin = name_start;
+       srch.len = name_end - name_start;
+
+       if (ct->attrs) {
+               found = g_hash_table_lookup (ct->attrs, &srch);
+       }
+       else {
+               ct->attrs = g_hash_table_new (rspamd_ftok_icase_hash,
+                               rspamd_ftok_icase_equal);
+       }
+
+       nparam = rspamd_mempool_alloc (pool, sizeof (*nparam));
+       nparam->name.begin = name_start;
+       nparam->name.len = name_end - name_start;
+       nparam->value.begin = value_start;
+       nparam->value.len = value_end - value_start;
+
+       /*
+        * DL_APPEND turns `found` into `nparam` when the list is empty, so the
+        * "is this name new?" decision must be taken before the append;
+        * testing `found` afterwards would never insert anything into the table.
+        */
+       first_with_name = (found == NULL);
+       DL_APPEND (found, nparam);
+
+       if (first_with_name) {
+               /* The key points into the node, which lives as long as the pool */
+               g_hash_table_insert (ct->attrs, &nparam->name, nparam);
+       }
+}
+
+/*
+ * Parses a content type header value.
+ *
+ * A lowercased copy of `in` is allocated from `pool` and handed to the
+ * generated parser. On success the parsed structure is copied into `pool`
+ * and returned with `lc_data` pointing at that lowercased copy; if a
+ * parameter table was created, its release is registered as a pool
+ * destructor. On failure a warning is logged and NULL is returned.
+ */
+struct rspamd_content_type *
+rspamd_content_type_parse (const gchar *in,
+               gsize len, rspamd_mempool_t *pool)
+{
+       struct rspamd_content_type *res = NULL, val;
+       gchar *lc_data;
+
+       /*
+        * Keep the copy in a local variable: rspamd_content_type_parser zeroes
+        * the whole structure it is given, so a pointer stored in `val` before
+        * the call would be wiped out.
+        */
+       lc_data = rspamd_mempool_alloc (pool, len);
+       memcpy (lc_data, in, len);
+       rspamd_str_lc (lc_data, len);
+
+       if (rspamd_content_type_parser (lc_data, len, &val, pool)) {
+               /* Tokens in `val` point into lc_data, so publish it with the result */
+               val.lc_data = lc_data;
+               res = rspamd_mempool_alloc (pool, sizeof (val));
+               memcpy (res, &val, sizeof (val));
+
+               if (res->attrs) {
+                       rspamd_mempool_add_destructor (pool,
+                                       (rspamd_mempool_destruct_t)g_hash_table_unref, res->attrs);
+               }
+       }
+       else {
+               /* NOTE(review): `%*s` is presumed to mean length + string in rspamd's logger - confirm */
+               msg_warn_pool ("cannot parse content type: %*s", (gint)len, lc_data);
+       }
+
+       return res;
+}
diff --git a/src/libmime/content_type.h b/src/libmime/content_type.h
new file mode 100644 (file)
index 0000000..7c12993
--- /dev/null
@@ -0,0 +1,61 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_LIBMIME_CONTENT_TYPE_H_
+#define SRC_LIBMIME_CONTENT_TYPE_H_
+
+#include "config.h"
+#include "libutil/fstring.h"
+#include "libutil/mem_pool.h"
+
+struct rspamd_content_type_param { /* One `name=value` parameter of a content type */
+       rspamd_ftok_t name; /* Parameter name (pointer + length token, not a copy) */
+       rspamd_ftok_t value; /* Parameter value (pointer + length token, not a copy) */
+       struct rspamd_content_type_param *prev, *next; /* List links chaining parameters that share a name */
+};
+
+struct rspamd_content_type { /* Result of parsing a content type header value */
+       gchar *lc_data; /* Lowercased copy of the input made by rspamd_content_type_parse */
+       rspamd_ftok_t type; /* Main type token (the part before "/") */
+       rspamd_ftok_t subtype; /* Subtype token (the part after "/"), if present */
+       rspamd_ftok_t charset; /* NOTE(review): no visible code assigns this field - confirm where it is filled */
+       GHashTable *attrs; /* Parameter name -> parameter list; NULL when no parameter was added */
+};
+
+/**
+ * Adds new parameter to content type structure. Name and value are stored
+ * as tokens pointing into the caller's buffer (no copy is made); parameters
+ * are looked up by name case-insensitively.
+ * @param pool memory pool used to allocate the parameter node
+ * @param ct content type to update, must not be NULL (asserted)
+ * @param name_start first character of the parameter name
+ * @param name_end one past the last character of the parameter name
+ * @param value_start first character of the parameter value
+ * @param value_end one past the last character of the parameter value
+ */
+void
+rspamd_content_type_add_param (rspamd_mempool_t *pool,
+               struct rspamd_content_type *ct,
+               const gchar *name_start, const gchar *name_end,
+               const gchar *value_start, const gchar *value_end);
+
+/**
+ * Parse content type from the header (performs copy + lowercase)
+ * @param in header value to parse, it is not modified
+ * @param len length of `in` in bytes
+ * @param pool memory pool used for the lowercased copy and for the result
+ * @return parsed content type allocated from `pool`, or NULL if no type
+ * could be parsed (a warning is logged in that case)
+ */
+struct rspamd_content_type * rspamd_content_type_parse (const gchar *in,
+               gsize len, rspamd_mempool_t *pool);
+
+#endif /* SRC_LIBMIME_CONTENT_TYPE_H_ */
index 07bd246885b999ebd437e666c01868f9eb62ae2f..0d6e2341316fbd20ded21c8ef5505843e2729f42 100644 (file)
@@ -18,6 +18,7 @@
 
 #include "config.h"
 #include "email_addr.h"
+#include "content_type.h"
 #include "task.h"
 #include "message.h"
 
@@ -30,4 +31,7 @@ void rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
                GByteArray *data, gboolean is_html, guint *newlines_count,
                GPtrArray *newlines);
 
+gboolean rspamd_content_type_parser (const char *data, size_t len,
+               struct rspamd_content_type *ct, rspamd_mempool_t *pool); /* Ragel-generated; zeroes `ct`, returns TRUE if a non-empty type was found */
+
 #endif /* SRC_LIBMIME_SMTP_PARSERS_H_ */
diff --git a/src/ragel/content_type.rl b/src/ragel/content_type.rl
new file mode 100644 (file)
index 0000000..d9c222e
--- /dev/null
@@ -0,0 +1,40 @@
+%%{
+  machine content_type;
+  include smtp_whitespace "smtp_whitespace.rl";
+
+  # https://tools.ietf.org/html/rfc2045#section-5.1
+
+  ccontent = ctext | FWS | '(' @{ fcall balanced_ccontent; }; # An opening paren inside a comment calls the nested-comment machine
+  balanced_ccontent := ccontent* ')' @{ fret; }; # Consumes up to the matching closing paren, then returns to the caller
+  comment        =   "(" (FWS? ccontent)* FWS? ")";
+  CFWS           =   ((FWS? comment)+ FWS?) | FWS;
+  qcontent = qtextSMTP | quoted_pairSMTP;
+  quoted_string = CFWS?
+                  (DQUOTE
+                    (((FWS? qcontent)* FWS?) >Quoted_Str_Start %Quoted_Str_End)
+                  DQUOTE) CFWS?; # The quoted-string actions wrap only the text between the quotes
+  token = 0x21..0x27 | 0x2a..0x2b | 0x2c..0x2e | 0x30..0x39 | 0x41..0x5a | 0x5e..0x7e; # Bytes 0x21..0x7e except ( ) / : ; < = > ? @ [ \ ]
+  value = (quoted_string | (token -- '"' | 0x3d)+) >Param_Value_Start %Param_Value_End; # NOTE(review): `--` and `|` presumably group left to right, so 0x3d would be added to, not removed from, the set - confirm intent
+  attribute = (token+) >Param_Name_Start %Param_Name_End;
+  parameter = CFWS? attribute "=" value CFWS?;
+
+  ietf_token = token+;
+  custom_x_token = 'x'i "-" token+;
+  extension_token = ietf_token | custom_x_token;
+  discrete_type = 'text'i | 'image'i | 'audio'i | 'video'i |
+                  'application'i | extension_token;
+  composite_type = 'message'i | 'multipart'i | extension_token;
+  iana_token = token+;
+  main_type = (discrete_type | composite_type) >Type_Start %Type_End;
+  sub_type = (extension_token | iana_token) >Subtype_Start %Subtype_End;
+  content_type = main_type ("/" sub_type)? (((CFWS? ";"+) | CFWS) parameter CFWS?)*; # type, optional /subtype, then parameters introduced by ";" or CFWS
+
+  prepush { /* Host code placed before each fcall: grow the call stack when it is full */
+    if (top >= st_storage.size) {
+      st_storage.size = (top + 1) * 2;
+      st_storage.data = realloc (st_storage.data, st_storage.size * sizeof (int));
+      g_assert (st_storage.data != NULL);
+      stack = st_storage.data; /* realloc may have moved the buffer, so refresh the stack pointer */
+    }
+  }
+}%%
\ No newline at end of file
diff --git a/src/ragel/content_type_parser.rl b/src/ragel/content_type_parser.rl
new file mode 100644 (file)
index 0000000..aec3db2
--- /dev/null
@@ -0,0 +1,152 @@
+%%{
+  machine content_type_parser;
+
+  action Type_Start { /* Main type begins: clear quoted-string marks and remember the start */
+    qstart = NULL;
+    qend = NULL;
+    ct->type.begin = p;
+  }
+
+  action Type_End { /* Main type ends: store its length, preferring quoted-string bounds when set */
+    if (qstart) {
+      ct->type.begin = qstart;
+    }
+    if (qend && qend >= qstart) {
+      ct->type.len = qend - qstart;
+    }
+    else if (p >= ct->type.begin) {
+      ct->type.len = p - ct->type.begin;
+    }
+    qstart = NULL;
+    qend = NULL;
+  }
+
+  action Subtype_Start { /* Subtype begins: clear quoted-string marks and remember the start */
+    qstart = NULL;
+    qend = NULL;
+    ct->subtype.begin = p;
+  }
+
+  action Subtype_End { /* Subtype ends: same length logic as Type_End, applied to ct->subtype */
+    if (qstart) {
+      ct->subtype.begin = qstart;
+    }
+    if (qend && qend >= qstart) {
+      ct->subtype.len = qend - qstart;
+    }
+    else if (p >= ct->subtype.begin) {
+      ct->subtype.len = p - ct->subtype.begin;
+    }
+    qstart = NULL;
+    qend = NULL;
+  }
+
+  action Param_Name_Start { /* Parameter name begins: remember the start, end is not known yet */
+    qstart = NULL;
+    qend = NULL;
+    pname_start = p;
+    pname_end = NULL;
+  }
+
+
+  action Param_Name_End { /* Parameter name ends: fix pname_start/pname_end for the value actions */
+    if (qstart) {
+      pname_start = qstart;
+    }
+    if (qend && qend >= qstart) {
+      pname_end = qend;
+    }
+    else if (p >= pname_start) {
+      pname_end = p;
+    }
+    qstart = NULL;
+    qend = NULL;
+  }
+
+
+  action Param_Value_Start { /* Parameter value begins: only tracked when a complete name precedes it */
+    qstart = NULL;
+    qend = NULL;
+
+    if (pname_end) {
+      pvalue_start = p;
+      pvalue_end = NULL;
+    }
+  }
+
+
+  action Param_Value_End { /* Parameter value ends: register the name/value pair, then reset all cursors */
+    if (pname_end) {
+      if (qstart) {
+        pvalue_start = qstart; /* Quoted value: use the bounds inside the quotes */
+      }
+      if (qend && qend >= qstart) {
+        pvalue_end = qend;
+      }
+      else if (p >= pvalue_start) {
+        pvalue_end = p;
+      }
+      qstart = NULL;
+      qend = NULL;
+
+      if (pvalue_end && pvalue_end > pvalue_start && pname_end > pname_start) { /* Empty names and empty values are not stored */
+        rspamd_content_type_add_param (pool, ct, pname_start, pname_end, pvalue_start, pvalue_end);
+      }
+    }
+
+    pname_start = NULL;
+    pname_end = NULL;
+    pvalue_start = NULL;
+    pvalue_end = NULL;
+    qend = NULL;
+    qstart = NULL;
+  }
+
+  action Quoted_Str_Start { /* First position inside the quotes */
+    qstart = p;
+    qend = NULL;
+  }
+
+  action Quoted_Str_End { /* Position just past the quoted content, recorded only if a start was seen */
+    if (qstart) {
+      qend = p;
+    }
+  }
+
+
+  include content_type "content_type.rl";
+
+  main := content_type;
+
+}%%
+
+#include "smtp_parsers.h"
+#include "content_type.h"
+
+%% write data;
+
+/*
+ * Runs the content_type_parser machine over [data, data + len).
+ *
+ * `ct` is zeroed before parsing, so anything the caller stored in it is
+ * discarded. The machine's actions fill ct->type and ct->subtype and call
+ * rspamd_content_type_add_param for each parameter; all tokens point into
+ * `data`. The call stack used by the machine is grown on demand with
+ * realloc and released here.
+ *
+ * Returns TRUE when a non-empty type was captured; the final machine state
+ * is not otherwise checked.
+ */
+gboolean
+rspamd_content_type_parser (const char *data, size_t len, struct rspamd_content_type *ct, rspamd_mempool_t *pool)
+{
+  /* Every cursor starts as NULL so no action can ever read an indeterminate pointer */
+  const char *p = data, *pe = data + len, *eof, *qstart = NULL, *qend = NULL,
+    *pname_start = NULL, *pname_end = NULL, *pvalue_start = NULL,
+    *pvalue_end = NULL;
+  int cs, *stack = NULL;
+  gsize top = 0;
+  struct _ragel_st_storage {
+    int *data;
+    gsize size;
+  } st_storage;
+
+  memset (&st_storage, 0, sizeof (st_storage));
+  memset (ct, 0, sizeof (*ct));
+  /* The whole input is available at once, so end of buffer is end of input */
+  eof = pe;
+
+  %% write init;
+  %% write exec;
+
+  /* free (NULL) is a no-op, so no guard is needed when the stack was never grown */
+  free (st_storage.data);
+
+  return ct->type.len > 0;
+}