summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/libmime/mime_parser.c318
1 files changed, 136 insertions, 182 deletions
diff --git a/src/libmime/mime_parser.c b/src/libmime/mime_parser.c
index 00ea8e5a0..6fa78b3e0 100644
--- a/src/libmime/mime_parser.c
+++ b/src/libmime/mime_parser.c
@@ -42,14 +42,17 @@ static const guint max_key_usages = 10000;
#define RSPAMD_BOUNDARY_IS_CLOSED(b) ((b)->flags & RSPAMD_MIME_BOUNDARY_FLAG_CLOSED)
struct rspamd_mime_boundary {
+ goffset boundary;
goffset start;
guint64 hash;
+ guint64 closed_hash;
gint flags;
};
struct rspamd_mime_parser_ctx {
GPtrArray *stack; /* Stack of parts */
GArray *boundaries; /* Boundaries found in the whole message */
+ const gchar *start;
const gchar *pos;
const gchar *end;
};
@@ -289,6 +292,7 @@ struct rspamd_mime_multipart_cbdata {
struct rspamd_mime_parser_ctx *st;
const gchar *part_start;
rspamd_ftok_t *cur_boundary;
+ guint64 bhash;
GError **err;
};
@@ -373,6 +377,7 @@ rspamd_mime_process_multipart_node (struct rspamd_task *task,
if (sel == NULL) {
/* TODO: assume part as octet-stream */
g_set_error (err, RSPAMD_MIME_QUARK, EINVAL, "no content type");
+ g_assert (0);
return FALSE;
}
@@ -396,199 +401,135 @@ rspamd_mime_process_multipart_node (struct rspamd_task *task,
return ret;
}
-static gint
-rspamd_mime_parse_multipart_cb (struct rspamd_multipattern *mp,
- guint strnum,
- gint match_start,
- gint match_pos,
- const gchar *text,
- gsize len,
- void *context)
+static gboolean
+rspamd_mime_parse_multipart_cb (struct rspamd_task *task,
+ struct rspamd_mime_part *multipart,
+ struct rspamd_mime_parser_ctx *st,
+ struct rspamd_mime_multipart_cbdata *cb,
+ struct rspamd_mime_boundary *b)
{
- struct rspamd_mime_multipart_cbdata *cb = context;
- struct rspamd_task *task;
- const gchar *pos = text + match_pos, *end = text + len, *st;
- gint ret = 0;
- guint i, j;
- struct rspamd_mime_part *par;
+ const gchar *pos = st->start + b->boundary;
task = cb->task;
- if (cb->st->pos && pos <= cb->st->pos) {
- /* Already processed */
- return 0;
- }
-
/* Now check boundary */
if (!cb->part_start) {
- if (cb->cur_boundary) {
- if (match_pos + cb->cur_boundary->len < len) {
- if (rspamd_lc_cmp (pos, cb->cur_boundary->begin,
- cb->cur_boundary->len) != 0) {
- msg_debug_mime ("found invalid boundary: %*s, %T expected",
- (gint)cb->cur_boundary->len, pos, cb->cur_boundary);
-
- /* Just continue search */
- return 0;
- }
-
- pos += cb->cur_boundary->len;
+ cb->part_start = st->start + b->start;
+ st->pos = cb->part_start;
+ }
+ else {
+ /* We have seen the start of the boundary */
+ if (cb->part_start < pos) {
+ /* We should have seen some boundary */
+ g_assert (cb->cur_boundary != NULL);
- while (pos < end && (*pos == '\r' || *pos == '\n')) {
- pos ++;
- }
- cb->part_start = pos;
- cb->st->pos = pos;
+ if (!rspamd_mime_process_multipart_node (task, cb->st,
+ cb->multipart, cb->part_start, pos, cb->err)) {
+ return FALSE;
}
- else {
- msg_debug_mime ("boundary is stripped");
- g_set_error (cb->err, RSPAMD_MIME_QUARK, EINVAL,
- "start boundary is stripped at %d (%zd available)",
- match_pos, len);
- return (-1);
- }
+ /* Go towards the next part */
+ cb->part_start = st->start + b->start;
+ cb->st->pos = cb->part_start;
}
else {
- /* We see something like boundary: '[\r\n]--xxx */
- /* TODO: write heuristic */
g_assert_not_reached ();
}
}
- else {
- /* We have seen the start of the boundary */
- if (cb->part_start < pos) {
- /* We should have seen some boundary */
- g_assert (cb->cur_boundary != NULL);
- if (pos == end) {
- return 1;
- /* We have part without ending line, assume it fine */
- st = match_pos + text - 2;
+ return TRUE;
+}
- if (!rspamd_mime_process_multipart_node (task, cb->st,
- cb->multipart, cb->part_start, st, cb->err)) {
- return -1;
- }
+static gint
+rspamd_multipart_boundaries_filter (struct rspamd_task *task,
+ struct rspamd_mime_part *multipart,
+ struct rspamd_mime_parser_ctx *st,
+ struct rspamd_mime_multipart_cbdata *cb)
+{
+ struct rspamd_mime_boundary *cur;
+ goffset last_offset;
+ guint i, sel = 0;
- while (pos < end && (*pos == '\r' || *pos == '\n')) {
- pos ++;
- }
+ last_offset = (multipart->raw_data.begin - st->start) +
+ multipart->raw_data.len;
- /* Go towards the next part */
- cb->part_start = pos;
- cb->st->pos = pos;
- }
+ /* Find the first offset suitable for this part */
+ for (i = 0; i < st->boundaries->len; i ++) {
+ cur = &g_array_index (st->boundaries, struct rspamd_mime_boundary, i);
- if (match_pos + cb->cur_boundary->len > len ||
- rspamd_lc_cmp (pos, cb->cur_boundary->begin,
- cb->cur_boundary->len) != 0) {
- msg_debug_mime ("found invalid boundary: %*s, %T expected",
- (gint)cb->cur_boundary->len, pos, cb->cur_boundary);
-
- /*
- * We also need to check parent parts:
- *
- * --1
- * --1.1
- * --1.1 <- this one is closed implicitly by the next
- * --1--
- */
- if (cb->st->stack->len > 0) {
- for (i = cb->st->stack->len - 1; ; i --) {
- par = g_ptr_array_index (cb->st->stack, i);
-
- if (par->ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
- if (par->ct->boundary.len > 0 &&
- end - pos >= par->ct->boundary.len &&
- rspamd_lc_cmp (pos,
- par->ct->boundary.begin,
- par->ct->boundary.len) == 0) {
- pos += par->ct->boundary.len + 1;
-
- /* Now we need to check for:
- * 1. --\r|\n
- * 2. \r|\n
- * 3. --<EOF>
- */
- ret = 0;
-
- if (pos == end || (*pos == '\r' || *pos == '\n')) {
- ret = 1;
- }
- else if (pos < end - 1) {
- if (pos[0] == '-' && pos[1] == '-') {
- pos += 2;
- if (pos == end ||
- (*pos == '\r' || *pos == '\n')) {
- ret = 1;
- }
- }
- }
-
- if (ret) {
- /* Unwind parts stack to this part */
- for (j = cb->st->stack->len - 1; j > i; j ++) {
- g_ptr_array_remove_index_fast (
- cb->st->stack, j);
- }
-
- g_ptr_array_remove_index_fast (cb->st->stack, i);
- }
-
- break;
- }
- }
-
- if (i == 0) {
- break;
- }
- }
+ if (cur->start >= multipart->raw_data.begin - st->start) {
+ if (cb->cur_boundary) {
+ /* Check boundary */
+ msg_debug_mime ("compare %L and %L", cb->bhash, cur->hash);
+
+ if (cb->bhash == cur->hash) {
+ sel = i;
+ break;
}
- if (!ret) {
- /* Not a valid boundary */
- return 0;
+ else if (cb->bhash == cur->closed_hash) {
+ /* Not a closing element in fact */
+ cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
+ sel = i;
+ break;
}
}
else {
- pos += cb->cur_boundary->len;
- cb->st->pos = pos;
-
- if (pos < end - 1 && pos[0] == '-' && pos[1] == '-') {
- /* It should be end of multipart, but it is sometimes isn't */
- /* TODO: deal with such perversions */
- pos += 2;
- ret = 1;
- }
- else if (pos[0] != '\r' && pos[0] != '\n' && pos != end) {
- /* This is not actually our boundary, but something else */
- return 0;
- }
+ /* Set current boundary */
+ cb->cur_boundary = rspamd_mempool_alloc (task->task_pool,
+ sizeof (rspamd_ftok_t));
+ cb->cur_boundary->begin = st->start + cur->boundary;
+ cb->cur_boundary->len = 0;
+ cb->bhash = cur->hash;
+ sel = i;
+ break;
+ }
+ }
+ }
- st = match_pos + text - 2;
+ /* Now we can go forward with boundaries that are same to what we have */
+ for (i = sel; i < st->boundaries->len; i ++) {
+ cur = &g_array_index (st->boundaries, struct rspamd_mime_boundary, i);
- if (!rspamd_mime_process_multipart_node (task, cb->st,
- cb->multipart, cb->part_start, st, cb->err)) {
- return -1;
- }
+ if (cur->boundary > last_offset) {
+ break;
+ }
+
+ if (cur->hash == cb->bhash) {
+ if (!rspamd_mime_parse_multipart_cb (task, multipart, st,
+ cb, cur)) {
+ return FALSE;
+ }
- while (pos < end && (*pos == '\r' || *pos == '\n')) {
- pos ++;
+ if (RSPAMD_BOUNDARY_IS_CLOSED (cur)) {
+ /* We also might check the next boundary... */
+ if (i < st->boundaries->len - 1) {
+ cur = &g_array_index (st->boundaries,
+ struct rspamd_mime_boundary, i + 1);
+
+ if (cur->hash == cb->bhash) {
+ continue;
+ }
}
- /* Go towards the next part */
- cb->part_start = pos;
- cb->st->pos = pos;
+ break;
}
}
- else {
- /* We have something very bad in fact */
- g_assert_not_reached ();
+ }
+
+ if (i == st->boundaries->len && cb->cur_boundary) {
+ /* Process the last part */
+ struct rspamd_mime_boundary fb;
+
+ fb.boundary = last_offset;
+
+ if (!rspamd_mime_parse_multipart_cb (task, multipart, st,
+ cb, &fb)) {
+ return FALSE;
}
}
- return ret;
+ return TRUE;
}
static gboolean
@@ -598,7 +539,7 @@ rspamd_mime_parse_multipart_part (struct rspamd_task *task,
GError **err)
{
struct rspamd_mime_multipart_cbdata cbdata;
- gint ret;
+ gboolean ret;
if (st->stack->len > max_nested) {
g_set_error (err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d",
@@ -618,29 +559,22 @@ rspamd_mime_parse_multipart_part (struct rspamd_task *task,
if (part->ct->boundary.len > 0) {
/* We know our boundary */
cbdata.cur_boundary = &part->ct->boundary;
+ rspamd_cryptobox_siphash ((guchar *)&cbdata.bhash,
+ cbdata.cur_boundary->begin, cbdata.cur_boundary->len,
+ lib_ctx->hkey);
+ msg_debug_mime ("hash: %T -> %L", cbdata.cur_boundary, cbdata.bhash);
}
else {
/* Guess boundary */
cbdata.cur_boundary = NULL;
+ cbdata.bhash = 0;
}
- ret = rspamd_multipattern_lookup (lib_ctx->mp_boundary,
- part->raw_data.begin - 1,
- part->raw_data.len + 1,
- rspamd_mime_parse_multipart_cb, &cbdata, NULL);
-
- if (st->pos < part->raw_data.begin + part->raw_data.len) {
- if (!rspamd_mime_process_multipart_node (task, st,
- part, cbdata.part_start,
- part->raw_data.begin + part->raw_data.len, err)) {
- return -1;
- }
- }
-
+ ret = rspamd_multipart_boundaries_filter (task, part, st, &cbdata);
/* Cleanup stack */
g_ptr_array_remove_index_fast (st->stack, st->stack->len - 1);
- return (ret != -1);
+ return ret;
}
/* Process boundary like structures in a message */
@@ -653,7 +587,8 @@ rspamd_mime_preprocess_cb (struct rspamd_multipattern *mp,
gsize len,
void *context)
{
- const gchar *end = text + len, *p = text + match_pos, *bend, *pend;
+ const gchar *end = text + len, *p = text + match_pos, *bend;
+ gchar *lc_copy;
gsize blen;
gboolean closing = FALSE;
struct rspamd_mime_boundary b;
@@ -665,13 +600,13 @@ rspamd_mime_preprocess_cb (struct rspamd_multipattern *mp,
if (blen > 0) {
/* We have found something like boundary */
bend = p + blen - 1;
- pend = p + blen;
if (*bend == '-') {
/* We need to verify last -- */
if (bend > p + 1 && *(bend - 1) == '-') {
closing = TRUE;
bend --;
+ blen -= 2;
}
else {
/* Not a closing boundary somehow */
@@ -695,12 +630,20 @@ rspamd_mime_preprocess_cb (struct rspamd_multipattern *mp,
bend ++;
}
- b.start = bend - text;
- rspamd_cryptobox_siphash ((guchar *)&b.hash, p, bend - p,
+ b.boundary = p - text - 3;
+ b.start = bend - text - 1;
+
+ lc_copy = g_malloc (blen);
+ memcpy (lc_copy, p, blen);
+ rspamd_str_lc (lc_copy, blen);
+ rspamd_cryptobox_siphash ((guchar *)&b.hash, lc_copy, blen,
lib_ctx->hkey);
+ g_free (lc_copy);
if (closing) {
b.flags = RSPAMD_MIME_BOUNDARY_FLAG_CLOSED;
+ rspamd_cryptobox_siphash ((guchar *)&b.closed_hash, p, blen + 2,
+ lib_ctx->hkey);
}
else {
b.flags = 0;
@@ -718,10 +661,19 @@ rspamd_mime_preprocess_message (struct rspamd_task *task,
struct rspamd_mime_part *top,
struct rspamd_mime_parser_ctx *st)
{
- rspamd_multipattern_lookup (lib_ctx->mp_boundary,
- top->raw_data.begin - 1,
- top->raw_data.len + 1,
- rspamd_mime_preprocess_cb, st, NULL);
+
+ if (top->raw_data.begin >= st->pos) {
+ rspamd_multipattern_lookup (lib_ctx->mp_boundary,
+ top->raw_data.begin - 1,
+ top->raw_data.len + 1,
+ rspamd_mime_preprocess_cb, st, NULL);
+ }
+ else {
+ rspamd_multipattern_lookup (lib_ctx->mp_boundary,
+ st->pos,
+ st->end - st->pos,
+ rspamd_mime_preprocess_cb, st, NULL);
+ }
}
static gboolean
@@ -930,6 +882,7 @@ rspamd_mime_parse_task (struct rspamd_task *task, GError **err)
if (++lib_ctx->key_usages > max_key_usages) {
/* Regenerate siphash key */
ottery_rand_bytes (lib_ctx->hkey, sizeof (lib_ctx->hkey));
+ lib_ctx->key_usages = 0;
}
st = g_slice_alloc0 (sizeof (*st));
@@ -943,6 +896,7 @@ rspamd_mime_parse_task (struct rspamd_task *task, GError **err)
st->pos = task->msg.begin;
}
+ st->start = task->msg.begin;
ret = rspamd_mime_parse_message (task, NULL, st, err);
rspamd_mime_parse_stack_free (st);