Browse Source

[Rework] Implement content type parser for mime

tags/1.5.0
Vsevolod Stakhov 7 years ago
parent
commit
1d95f16786

+ 9
- 2
src/CMakeLists.txt View File

@@ -105,7 +105,8 @@ SET(RAGEL_DEPENDS "${CMAKE_SOURCE_DIR}/src/ragel/smtp_address.rl"
"${CMAKE_SOURCE_DIR}/src/ragel/smtp_date.rl"
"${CMAKE_SOURCE_DIR}/src/ragel/smtp_ip.rl"
"${CMAKE_SOURCE_DIR}/src/ragel/smtp_whitespace.rl"
"${CMAKE_SOURCE_DIR}/src/ragel/smtp_received.rl")
"${CMAKE_SOURCE_DIR}/src/ragel/smtp_received.rl"
"${CMAKE_SOURCE_DIR}/src/ragel/content_type.rl")
RAGEL_TARGET(ragel_smtp_addr
INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/smtp_addr_parser.rl
DEPENDS ${RAGEL_DEPENDS}
@@ -121,6 +122,11 @@ RAGEL_TARGET(ragel_newlines_strip
DEPENDS ${RAGEL_DEPENDS}
COMPILE_FLAGS -G2
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/newlines_strip.rl.c)
# Run Ragel over content_type_parser.rl; the generated C file is written to
# the current binary dir and exposed as RAGEL_ragel_content_type_OUTPUTS.
RAGEL_TARGET(
	ragel_content_type
	INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/content_type_parser.rl
	DEPENDS ${RAGEL_DEPENDS}
	COMPILE_FLAGS -G2
	OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/content_type.rl.c
)
######################### LINK SECTION ###############################

ADD_LIBRARY(rspamd-server STATIC
@@ -134,7 +140,8 @@ ADD_LIBRARY(rspamd-server STATIC
${PLUGINSSRC}
"${RAGEL_ragel_smtp_addr_OUTPUTS}"
"${RAGEL_ragel_smtp_received_OUTPUTS}"
"${RAGEL_ragel_newlines_strip_OUTPUTS}")
"${RAGEL_ragel_newlines_strip_OUTPUTS}"
"${RAGEL_ragel_content_type_OUTPUTS}")
# Bundled helper libraries linked into rspamd-server (order preserved).
TARGET_LINK_LIBRARIES(rspamd-server
	rspamd-http-parser
	rspamd-cdb
	rspamd-lpeg)

+ 2
- 1
src/libmime/CMakeLists.txt View File

@@ -5,6 +5,7 @@ SET(LIBRSPAMDMIMESRC
${CMAKE_CURRENT_SOURCE_DIR}/filter.c
${CMAKE_CURRENT_SOURCE_DIR}/images.c
${CMAKE_CURRENT_SOURCE_DIR}/message.c
${CMAKE_CURRENT_SOURCE_DIR}/archives.c)
${CMAKE_CURRENT_SOURCE_DIR}/archives.c
${CMAKE_CURRENT_SOURCE_DIR}/content_type.c)

# Publish the libmime source list to the parent directory scope as RSPAMD_MIME.
SET(RSPAMD_MIME ${LIBRSPAMDMIMESRC} PARENT_SCOPE)

+ 79
- 0
src/libmime/content_type.c View File

@@ -0,0 +1,79 @@
/*-
* Copyright 2016 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "libmime/content_type.h"
#include "smtp_parsers.h"
#include "utlist.h"

/*
 * Records one `name=value` parameter of a content type.
 *
 * The parameter record is allocated from `pool` and references the
 * [name_start, name_end) and [value_start, value_end) spans directly; nothing
 * is copied, so the underlying buffer must outlive `ct`.
 *
 * `ct->attrs` is created on first use with case-insensitive hashing of the
 * name. The first parameter seen for a name becomes the hash table value and
 * the head of a utlist doubly-linked list; later parameters with the same
 * name are appended to that list.
 */
void
rspamd_content_type_add_param (rspamd_mempool_t *pool,
		struct rspamd_content_type *ct,
		const gchar *name_start, const gchar *name_end,
		const gchar *value_start, const gchar *value_end)
{
	rspamd_ftok_t srch;
	struct rspamd_content_type_param *found = NULL, *nparam;
	gboolean is_new_name;

	g_assert (ct != NULL);

	srch.begin = name_start;
	srch.len = name_end - name_start;

	if (ct->attrs) {
		found = g_hash_table_lookup (ct->attrs, &srch);
	}
	else {
		ct->attrs = g_hash_table_new (rspamd_ftok_icase_hash,
				rspamd_ftok_icase_equal);
	}

	nparam = rspamd_mempool_alloc (pool, sizeof (*nparam));
	nparam->name.begin = name_start;
	nparam->name.len = name_end - name_start;
	nparam->value.begin = value_start;
	nparam->value.len = value_end - value_start;

	/*
	 * DL_APPEND assigns the head pointer when the list is empty, so whether
	 * this name is new must be decided BEFORE appending; testing `found`
	 * afterwards would never be true and nothing would be inserted.
	 */
	is_new_name = (found == NULL);
	DL_APPEND (found, nparam);

	if (is_new_name) {
		/* The key points into the pool-allocated record itself */
		g_hash_table_insert (ct->attrs, &nparam->name, nparam);
	}
}

/*
 * Parses a Content-Type header value.
 *
 * The input is copied into `pool` and lowercased; the parser runs over that
 * copy, so all tokens in the result point into the copy rather than `in`.
 *
 * Returns a pool-allocated structure on success, or NULL (after logging a
 * warning) when the parser reports failure. When parameters were collected,
 * the attribute hash table is released together with the pool.
 */
struct rspamd_content_type *
rspamd_content_type_parse (const gchar *in,
		gsize len, rspamd_mempool_t *pool)
{
	struct rspamd_content_type *res = NULL, val;
	gchar *lc_data;

	lc_data = rspamd_mempool_alloc (pool, len);
	memcpy (lc_data, in, len);
	rspamd_str_lc (lc_data, len);

	if (rspamd_content_type_parser (lc_data, len, &val, pool)) {
		/*
		 * The parser zeroes the whole structure before filling it, so the
		 * buffer pointer has to be stored after parsing, not before.
		 */
		val.lc_data = lc_data;

		res = rspamd_mempool_alloc (pool, sizeof (val));
		memcpy (res, &val, sizeof (val));

		if (res->attrs) {
			rspamd_mempool_add_destructor (pool,
					(rspamd_mempool_destruct_t)g_hash_table_unref, res->attrs);
		}
	}
	else {
		/* Log from the local pointer: val.lc_data was reset by the parser */
		msg_warn_pool ("cannot parse content type: %*s", (gint)len, lc_data);
	}

	return res;
}

+ 61
- 0
src/libmime/content_type.h View File

@@ -0,0 +1,61 @@
/*-
* Copyright 2016 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SRC_LIBMIME_CONTENT_TYPE_H_
#define SRC_LIBMIME_CONTENT_TYPE_H_

#include "config.h"
#include "libutil/fstring.h"
#include "libutil/mem_pool.h"

/**
 * One `name=value` parameter of a content type.
 * Both tokens are (pointer, length) spans; rspamd_content_type_add_param()
 * fills them from the caller's pointers without copying the data.
 */
struct rspamd_content_type_param {
/* Parameter name span */
rspamd_ftok_t name;
/* Parameter value span */
rspamd_ftok_t value;
/* Links used by utlist's DL_APPEND to chain parameters found under the same name */
struct rspamd_content_type_param *prev, *next;
};

/**
 * Result of parsing a Content-Type header value.
 */
struct rspamd_content_type {
/* Intended to hold the lowercased copy of the header made by rspamd_content_type_parse() */
gchar *lc_data;
/* Main type span, set by the parser's Type_Start/Type_End actions */
rspamd_ftok_t type;
/* Subtype span, set by the parser's Subtype_Start/Subtype_End actions */
rspamd_ftok_t subtype;
/* NOTE(review): not assigned by the parser code visible here - confirm who fills it */
rspamd_ftok_t charset;
/*
 * Parameters keyed by name (case-insensitive hash/equal functions).
 * Created by rspamd_content_type_add_param() on first use, so it is NULL
 * when no parameter was added; it can also be empty.
 */
GHashTable *attrs; /* Can be empty */
};

/**
 * Adds new parameter to content type structure.
 * The name and value are stored as spans into the caller's buffer (no copy is
 * made), so that buffer must stay valid for as long as `ct` is used.
 * @param pool memory pool used to allocate the parameter record
 * @param ct content type to update, must not be NULL (asserted)
 * @param name_start first byte of the parameter name
 * @param name_end one past the last byte of the parameter name
 * @param value_start first byte of the parameter value
 * @param value_end one past the last byte of the parameter value
 */
void
rspamd_content_type_add_param (rspamd_mempool_t *pool,
struct rspamd_content_type *ct,
const gchar *name_start, const gchar *name_end,
const gchar *value_start, const gchar *value_end);

/**
 * Parse content type from the header (performs copy + lowercase)
 * @param in header value to parse; it is not modified
 * @param len length of `in` in bytes
 * @param pool memory pool used for the lowercased copy and for the result
 * @return structure allocated from `pool`, or NULL if the value cannot be
 * parsed (a warning is logged in that case)
 */
struct rspamd_content_type * rspamd_content_type_parse (const gchar *in,
gsize len, rspamd_mempool_t *pool);

#endif /* SRC_LIBMIME_CONTENT_TYPE_H_ */

+ 4
- 0
src/libmime/smtp_parsers.h View File

@@ -18,6 +18,7 @@

#include "config.h"
#include "email_addr.h"
#include "content_type.h"
#include "task.h"
#include "message.h"

@@ -30,4 +31,7 @@ void rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
GByteArray *data, gboolean is_html, guint *newlines_count,
GPtrArray *newlines);

/**
 * Ragel-based content type parser (see src/ragel/content_type_parser.rl).
 * Zeroes `*ct` first, then fills it with spans pointing into `data`;
 * parameters are added via rspamd_content_type_add_param() using `pool`.
 * @param data input to parse
 * @param len length of `data` in bytes
 * @param ct output structure; every field is reset before parsing
 * @param pool memory pool passed on when parameters are added
 * @return TRUE if a non-empty main type was extracted
 */
gboolean rspamd_content_type_parser (const char *data, size_t len,
struct rspamd_content_type *ct, rspamd_mempool_t *pool);

#endif /* SRC_LIBMIME_SMTP_PARSERS_H_ */

+ 40
- 0
src/ragel/content_type.rl View File

@@ -0,0 +1,40 @@
%%{
machine content_type;
include smtp_whitespace "smtp_whitespace.rl";

# Grammar of a Content-Type header value: type, optional subtype, parameters.
# This machine only references actions; the including machine must define
# Type_Start/Type_End, Subtype_Start/Subtype_End, Param_Name_Start/Param_Name_End,
# Param_Value_Start/Param_Value_End and Quoted_Str_Start/Quoted_Str_End.
# ctext, FWS, qtextSMTP, quoted_pairSMTP and DQUOTE are not defined in this file;
# presumably they come from the smtp_whitespace include - TODO confirm.
# https://tools.ietf.org/html/rfc2045#section-5.1

# Nested comments: an inner opening parenthesis calls balanced_ccontent, which
# returns to the caller once the matching closing parenthesis is consumed.
ccontent = ctext | FWS | '(' @{ fcall balanced_ccontent; };
balanced_ccontent := ccontent* ')' @{ fret; };
comment = "(" (FWS? ccontent)* FWS? ")";
CFWS = ((FWS? comment)+ FWS?) | FWS;
qcontent = qtextSMTP | quoted_pairSMTP;
# Quoted_Str_Start/End wrap only the text between the double quotes,
# so the reported span excludes the quotes and the surrounding CFWS.
quoted_string = CFWS?
(DQUOTE
(((FWS? qcontent)* FWS?) >Quoted_Str_Start %Quoted_Str_End)
DQUOTE) CFWS?;
token = 0x21..0x27 | 0x2a..0x2b | 0x2c..0x2e | 0x30..0x39 | 0x41..0x5a | 0x5e..0x7e;
# NOTE(review): in Ragel the strong difference and union operators appear to share
# one precedence level and group left to right, so the unquoted branch would read
# as (token minus double-quote) union 0x3d, i.e. an equals sign is accepted inside
# an unquoted value - confirm this is intended.
value = (quoted_string | (token -- '"' | 0x3d)+) >Param_Value_Start %Param_Value_End;
attribute = (token+) >Param_Name_Start %Param_Name_End;
parameter = CFWS? attribute "=" value CFWS?;

ietf_token = token+;
custom_x_token = 'x'i "-" token+;
extension_token = ietf_token | custom_x_token;
discrete_type = 'text'i | 'image'i | 'audio'i | 'video'i |
'application'i | extension_token;
composite_type = 'message'i | 'multipart'i | extension_token;
iana_token = token+;
main_type = (discrete_type | composite_type) >Type_Start %Type_End;
sub_type = (extension_token | iana_token) >Subtype_Start %Subtype_End;
# The subtype is optional; each parameter is introduced either by one or more
# semicolons (optionally preceded by CFWS) or by CFWS alone.
content_type = main_type ("/" sub_type)? (((CFWS? ";"+) | CFWS) parameter CFWS?)*;

# Grows the fcall stack when it is full: capacity becomes (top + 1) * 2 ints.
# The host code must provide top, stack and st_storage (with data and size members).
prepush {
if (top >= st_storage.size) {
st_storage.size = (top + 1) * 2;
st_storage.data = realloc (st_storage.data, st_storage.size * sizeof (int));
g_assert (st_storage.data != NULL);
stack = st_storage.data;
}
}
}%%

+ 152
- 0
src/ragel/content_type_parser.rl View File

@@ -0,0 +1,152 @@
%%{
machine content_type_parser;

# Actions for the content_type grammar included at the bottom of this block.
# They run inside the host function and use its locals: p (current position),
# ct (output structure), pool, qstart/qend (quoted string span),
# pname_start/pname_end and pvalue_start/pvalue_end (current parameter spans).

# Main type begins at p; forget any quoted string span seen earlier.
action Type_Start {
qstart = NULL;
qend = NULL;
ct->type.begin = p;
}

# Main type ends: use the quoted span when one was recorded,
# otherwise the span from the recorded start up to p.
action Type_End {
if (qstart) {
ct->type.begin = qstart;
}
if (qend && qend >= qstart) {
ct->type.len = qend - qstart;
}
else if (p >= ct->type.begin) {
ct->type.len = p - ct->type.begin;
}
qstart = NULL;
qend = NULL;
}

# Subtype begins at p; same bookkeeping as for the main type.
action Subtype_Start {
qstart = NULL;
qend = NULL;
ct->subtype.begin = p;
}

# Subtype ends: quoted span if recorded, otherwise start..p.
action Subtype_End {
if (qstart) {
ct->subtype.begin = qstart;
}
if (qend && qend >= qstart) {
ct->subtype.len = qend - qstart;
}
else if (p >= ct->subtype.begin) {
ct->subtype.len = p - ct->subtype.begin;
}
qstart = NULL;
qend = NULL;
}

# Parameter name begins at p; its end is unknown until Param_Name_End.
action Param_Name_Start {
qstart = NULL;
qend = NULL;
pname_start = p;
pname_end = NULL;
}


# Parameter name ends: quoted span if recorded, otherwise start..p.
action Param_Name_End {
if (qstart) {
pname_start = qstart;
}
if (qend && qend >= qstart) {
pname_end = qend;
}
else if (p >= pname_start) {
pname_end = p;
}
qstart = NULL;
qend = NULL;
}


# Parameter value begins; it is tracked only when a name end was recorded.
action Param_Value_Start {
qstart = NULL;
qend = NULL;

if (pname_end) {
pvalue_start = p;
pvalue_end = NULL;
}
}


# Parameter value ends: resolve the value span (quoted span preferred), add the
# parameter to ct when both name and value are non-empty, then reset all
# parameter and quoted string markers for the next parameter.
action Param_Value_End {
if (pname_end) {
if (qstart) {
pvalue_start = qstart;
}
if (qend && qend >= qstart) {
pvalue_end = qend;
}
else if (p >= pvalue_start) {
pvalue_end = p;
}
qstart = NULL;
qend = NULL;

if (pvalue_end && pvalue_end > pvalue_start && pname_end > pname_start) {
rspamd_content_type_add_param (pool, ct, pname_start, pname_end, pvalue_start, pvalue_end);
}
}

pname_start = NULL;
pname_end = NULL;
pvalue_start = NULL;
pvalue_end = NULL;
qend = NULL;
qstart = NULL;
}

# Start of the text inside a quoted string.
action Quoted_Str_Start {
qstart = p;
qend = NULL;
}

# End of the text inside a quoted string; recorded only if a start was seen.
action Quoted_Str_End {
if (qstart) {
qend = p;
}
}


include content_type "content_type.rl";

main := content_type;

}%%

#include "smtp_parsers.h"
#include "content_type.h"

%% write data;

/*
 * Runs the content_type_parser state machine over data[0..len).
 *
 * `*ct` is zeroed before parsing, so any field the caller set beforehand
 * (lc_data included) is cleared; the caller has to assign such fields after
 * this call. Tokens stored in `ct` point into `data`. Parameters are added
 * through rspamd_content_type_add_param() using `pool`.
 *
 * Returns TRUE when a non-empty main type was extracted.
 */
gboolean
rspamd_content_type_parser (const char *data, size_t len, struct rspamd_content_type *ct, rspamd_mempool_t *pool)
{
	/*
	 * Every span marker starts as NULL: the actions compare and test these
	 * pointers, so none of them may be left indeterminate.
	 */
	const char *p = data, *pe = data + len, *eof, *qstart = NULL, *qend = NULL,
		*pname_start = NULL, *pname_end = NULL,
		*pvalue_start = NULL, *pvalue_end = NULL;
	int cs, *stack = NULL;
	gsize top = 0;
	/* Growable call stack used by the grammar's prepush block for fcall */
	struct _ragel_st_storage {
		int *data;
		gsize size;
	} st_storage;

	memset (&st_storage, 0, sizeof (st_storage));
	memset (ct, 0, sizeof (*ct));
	/* The whole input is available at once, so EOF coincides with the end */
	eof = pe;

	%% write init;
	%% write exec;

	if (st_storage.data) {
		free (st_storage.data);
	}

	return ct->type.len > 0;
}

Loading…
Cancel
Save