/* mirrors/rspamd: mirror of https://github.com/vstakhov/rspamd.git */
							/*-
 * Copyright 2016 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "libmime/content_type.h"
#include "smtp_parsers.h"
#include "utlist.h"

/**
 * Register one `name=value` parameter on a parsed content type.
 *
 * The parameter record is allocated from `pool` and references the caller's
 * buffers directly (nothing is copied). Parameters sharing the same name are
 * chained in a doubly linked list; only the first element of each chain is
 * stored in `ct->attrs`, which is created lazily on first use.
 *
 * When the name is exactly "charset" or "boundary", the matching shortcut
 * token in `ct` is updated to the new value.
 *
 * @param pool        memory pool used for the parameter record
 * @param ct          content type to update (must not be NULL)
 * @param name_start  first byte of the parameter name
 * @param name_end    one past the last byte of the parameter name
 * @param value_start first byte of the parameter value
 * @param value_end   one past the last byte of the parameter value
 */
void
rspamd_content_type_add_param (rspamd_mempool_t *pool,
		struct rspamd_content_type *ct,
		const gchar *name_start, const gchar *name_end,
		const gchar *value_start, const gchar *value_end)
{
	struct rspamd_content_type_param *head = NULL, *param;
	rspamd_ftok_t key;
	gboolean first_seen;

	g_assert (ct != NULL);

	key.begin = name_start;
	key.len = name_end - name_start;

	if (ct->attrs == NULL) {
		/* Lazily create the attribute table */
		ct->attrs = g_hash_table_new (rspamd_ftok_icase_hash,
				rspamd_ftok_icase_equal);
	}
	else {
		head = g_hash_table_lookup (ct->attrs, &key);
	}

	param = rspamd_mempool_alloc (pool, sizeof (*param));
	param->name.begin = name_start;
	param->name.len = name_end - name_start;
	param->value.begin = value_start;
	param->value.len = value_end - value_start;

	/* Remember whether a chain existed before DL_APPEND touches the head */
	first_seen = (head == NULL);
	DL_APPEND (head, param);

	if (first_seen) {
		/* The table is keyed by the name token stored in the first element */
		g_hash_table_insert (ct->attrs, &param->name, param);
	}

	RSPAMD_FTOK_ASSIGN (&key, "charset");

	if (rspamd_ftok_cmp (&param->name, &key) == 0) {
		/* Latest charset parameter wins */
		ct->charset.begin = param->value.begin;
		ct->charset.len = param->value.len;
	}
	else {
		RSPAMD_FTOK_ASSIGN (&key, "boundary");

		if (rspamd_ftok_cmp (&param->name, &key) == 0) {
			/* Latest boundary parameter wins */
			ct->boundary.begin = param->value.begin;
			ct->boundary.len = param->value.len;
		}
	}
}

/**
 * Hand-written state machine that splits a Content-Type header value into
 * type, subtype and parameters.
 *
 * All tokens stored in the result point into `in`; nothing is copied except
 * the final structure, which is allocated from `pool`. Parameters are
 * registered through rspamd_content_type_add_param(). Parenthesised comments
 * are skipped, with nesting tracked by counting opening and closing braces.
 *
 * @param in   header value to parse (stored in the result as `lc_data`)
 * @param len  number of bytes available at `in`
 * @param pool memory pool for the result and the parameter records
 * @return pool-allocated content type, or NULL when no non-empty type
 *         token was found
 */
static struct rspamd_content_type *
rspamd_content_type_parser (const gchar *in, gsize len, rspamd_mempool_t *pool)
{
	/* obraces/ebraces count '(' and ')' inside a comment; qlen is the length
	 * of the most recently closed quoted string */
	guint obraces = 0, ebraces = 0, qlen = 0;
	/* p is the cursor, c marks the start of the token being collected */
	const gchar *p, *c, *end, *pname_start = NULL, *pname_end = NULL;
	struct rspamd_content_type *res = NULL, val;
	gboolean eqsign_seen = FALSE;
	enum {
		parse_type,
		parse_subtype,
		parse_after_subtype,
		parse_param_name,
		parse_param_after_name,
		parse_param_value,
		parse_param_value_after_quote,
		parse_space,
		parse_quoted,
		parse_comment,
	} state = parse_space, next_state = parse_type;

	p = in;
	c = p;
	end = p + len;
	memset (&val, 0, sizeof (val));
	val.lc_data = (gchar *)in;

	while (p < end) {
		switch (state) {
		case parse_type:
			if (g_ascii_isspace (*p) || *p == ';') {
				/* We have type without subtype */
				val.type.begin = c;
				val.type.len = p - c;
				state = parse_after_subtype;
			} else if (*p == '/') {
				val.type.begin = c;
				val.type.len = p - c;
				state = parse_space;
				next_state = parse_subtype;
				p++;
			} else {
				p++;
			}
			break;
		case parse_subtype:
			if (g_ascii_isspace (*p) || *p == ';') {
				val.subtype.begin = c;
				val.subtype.len = p - c;
				state = parse_after_subtype;
			} else {
				p++;
			}
			break;
		case parse_after_subtype:
			/* Skip separators until a comment or a parameter name starts */
			if (*p == ';' || g_ascii_isspace (*p)) {
				p++;
			} else if (*p == '(') {
				c = p;
				state = parse_comment;
				next_state = parse_param_name;
				obraces = 1;
				ebraces = 0;
				pname_start = NULL;
				pname_end = NULL;
				eqsign_seen = FALSE;
				p++;
			} else {
				c = p;
				state = parse_param_name;
				pname_start = NULL;
				pname_end = NULL;
				eqsign_seen = FALSE;
			}
			break;
		case parse_param_name:
			if (*p == '=') {
				pname_start = c;
				pname_end = p;
				state = parse_param_after_name;
				eqsign_seen = TRUE;
				p++;
			} else if (g_ascii_isspace (*p)) {
				/* Name ended by whitespace, '=' may still follow */
				pname_start = c;
				pname_end = p;
				state = parse_param_after_name;
			} else {
				p++;
			}
			break;
		case parse_param_after_name:
			if (g_ascii_isspace (*p)) {
				p++;
			} else if (*p == '=') {
				if (eqsign_seen) {
					/* Treat as value start */
					c = p;
					eqsign_seen = FALSE;
					state = parse_space;
					next_state = parse_param_value;
					p++;
				} else {
					eqsign_seen = TRUE;
					p++;
				}
			} else {
				if (eqsign_seen) {
					state = parse_param_value;
					c = p;
				} else {
					/* Invalid parameter without value */
					c = p;
					state = parse_param_name;
					pname_start = NULL;
					pname_end = NULL;
				}
			}
			break;
		case parse_param_value:
			if (*p == '"') {
				/* Quoted value: the token starts after the opening quote */
				p++;
				c = p;
				state = parse_quoted;
				next_state = parse_param_value_after_quote;
			} else if (g_ascii_isspace (*p)) {
				if (pname_start && pname_end && pname_end > pname_start) {
					rspamd_content_type_add_param (pool, &val, pname_start,
							pname_end, c, p);

				}

				state = parse_space;
				next_state = parse_param_name;
				pname_start = NULL;
				pname_end = NULL;
			} else if (*p == '(') {
				/* A comment terminates an unquoted value */
				if (pname_start && pname_end && pname_end > pname_start) {
					rspamd_content_type_add_param (pool, &val, pname_start,
							pname_end, c, p);
				}

				obraces = 1;
				ebraces = 0;
				p++;
				state = parse_comment;
				next_state = parse_param_name;
				pname_start = NULL;
				pname_end = NULL;
			} else {
				p++;
			}
			break;
		case parse_param_value_after_quote:
			/* c and qlen describe the quoted string that has just ended */
			if (pname_start && pname_end && pname_end > pname_start) {
				rspamd_content_type_add_param (pool, &val, pname_start,
						pname_end, c, c + qlen);
			}

			if (g_ascii_isspace (*p)) {
				state = parse_space;
				next_state = parse_param_name;
				pname_start = NULL;
				pname_end = NULL;
			} else if (*p == '(') {
				obraces = 1;
				ebraces = 0;
				p++;
				state = parse_comment;
				next_state = parse_param_name;
				pname_start = NULL;
				pname_end = NULL;
			} else {
				state = parse_param_name;
				pname_start = NULL;
				pname_end = NULL;
				c = p;
			}
			break;
		case parse_quoted:
			if (*p == '\\') {
				/* Quoted pair */
				if (p + 1 < end) {
					p += 2;
				} else {
					p++;
				}
			} else if (*p == '"') {
				/* Closing quote: record the length, the cursor is not moved */
				qlen = p - c;
				state = next_state;
			} else {
				p++;
			}
			break;
		case parse_comment:
			if (*p == '(') {
				obraces++;
				p++;
			} else if (*p == ')') {
				ebraces++;
				p++;

				/* Leave the comment once every '(' has been matched */
				if (ebraces == obraces && p < end) {
					if (g_ascii_isspace (*p)) {
						state = parse_space;
					} else {
						c = p;
						state = next_state;
					}
				}
			} else {
				p++;
			}
			break;
		case parse_space:
			if (g_ascii_isspace (*p)) {
				p++;
			} else if (*p == '(') {
				obraces = 1;
				ebraces = 0;
				p++;
				state = parse_comment;
			} else {
				c = p;
				state = next_state;
			}
			break;
		}
	}

	/* Process leftover */
	switch (state) {
	case parse_type:
		val.type.begin = c;
		val.type.len = p - c;
		break;
	case parse_subtype:
		val.subtype.begin = c;
		val.subtype.len = p - c;
		break;
	case parse_param_value:
		if (pname_start && pname_end && pname_end > pname_start) {
			rspamd_content_type_add_param (pool, &val, pname_start,
					pname_end, c, p);

		}
		/*
		 * Must not fall through: the quoted-value case below would add the
		 * same parameter again with a length of `qlen`, which is unrelated
		 * to this unquoted value.
		 */
		break;
	case parse_param_value_after_quote:
		if (pname_start && pname_end && pname_end > pname_start) {
			rspamd_content_type_add_param (pool, &val, pname_start,
					pname_end, c, c + qlen);
		}
		break;
	default:
		break;
	}

	/* A content type without a type token is treated as unparseable */
	if (val.type.len > 0) {
		res = rspamd_mempool_alloc (pool, sizeof (val));
		memcpy (res, &val, sizeof (val));
	}

	return res;
}

/**
 * Parse a Content-Type header value.
 *
 * The input is copied into `pool` and lowercased before parsing, so every
 * token in the result refers to the lowercased copy. If parameters were
 * found, a pool destructor is registered to unref the attribute table.
 *
 * After parsing, a few fix-ups are applied for commonly seen broken values
 * (flagged with RSPAMD_CONTENT_TYPE_BROKEN):
 *  - "text" without subtype        -> text/plain
 *  - "html" without subtype        -> text/html
 *  - "application" without subtype -> application/octet-stream
 *  - subtype "alternate"           -> "alternative"
 * Finally the MULTIPART / TEXT / DSN / MESSAGE flags are derived from the
 * resulting type and subtype.
 *
 * @param in   raw header value
 * @param len  number of bytes available at `in`
 * @param pool memory pool used for all allocations
 * @return parsed content type, or NULL (with a warning logged) when the
 *         value cannot be parsed
 */
struct rspamd_content_type *
rspamd_content_type_parse (const gchar *in,
		gsize len, rspamd_mempool_t *pool)
{
	struct rspamd_content_type *res = NULL;
	rspamd_ftok_t srch;
	gchar *lc_data;

	lc_data = rspamd_mempool_alloc (pool, len);
	memcpy (lc_data, in, len);
	rspamd_str_lc (lc_data, len);

	if ((res = rspamd_content_type_parser (lc_data, len, pool)) != NULL) {
		if (res->attrs) {
			/* The hash table is not pool memory, release it with the pool */
			rspamd_mempool_add_destructor (pool,
					(rspamd_mempool_destruct_t)g_hash_table_unref, res->attrs);
		}

		/* Now do some hacks to work with broken content types */
		if (res->subtype.len == 0) {
			res->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
			RSPAMD_FTOK_ASSIGN (&srch, "text");

			if (rspamd_ftok_cmp (&res->type, &srch) == 0) {
				/* Workaround for Content-Type: text */
				/* Assume text/plain: set the subtype, not the scratch token */
				RSPAMD_FTOK_ASSIGN (&res->subtype, "plain");
			}
			else {
				RSPAMD_FTOK_ASSIGN (&srch, "html");

				if (rspamd_ftok_cmp (&res->type, &srch) == 0) {
					/* Workaround for Content-Type: html */
					RSPAMD_FTOK_ASSIGN (&res->type, "text");
					RSPAMD_FTOK_ASSIGN (&res->subtype, "html");
				}
				else {
					RSPAMD_FTOK_ASSIGN (&srch, "application");

					if (rspamd_ftok_cmp (&res->type, &srch) == 0) {
						/* Bare "application" is treated as a binary blob */
						RSPAMD_FTOK_ASSIGN (&res->subtype, "octet-stream");
					}
				}
			}
		}
		else {
			/* Common mistake: "alternate" used instead of "alternative" */
			RSPAMD_FTOK_ASSIGN (&srch, "alternate");

			if (rspamd_ftok_cmp (&res->subtype, &srch) == 0) {
				res->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
				RSPAMD_FTOK_ASSIGN (&res->subtype, "alternative");
			}
		}

		/* Classify the (possibly corrected) type */
		RSPAMD_FTOK_ASSIGN (&srch, "multipart");

		if (rspamd_ftok_cmp (&res->type, &srch) == 0) {
			res->flags |= RSPAMD_CONTENT_TYPE_MULTIPART;
		}
		else {
			RSPAMD_FTOK_ASSIGN (&srch, "text");

			if (rspamd_ftok_cmp (&res->type, &srch) == 0) {
				res->flags |= RSPAMD_CONTENT_TYPE_TEXT;
			}
			else {
				RSPAMD_FTOK_ASSIGN (&srch, "message");

				if (rspamd_ftok_cmp (&res->type, &srch) == 0) {
					RSPAMD_FTOK_ASSIGN (&srch, "delivery-status");

					if (rspamd_ftok_cmp (&res->subtype, &srch) == 0) {
						/* message/delivery-status is flagged as text + DSN */
						res->flags |= RSPAMD_CONTENT_TYPE_TEXT|RSPAMD_CONTENT_TYPE_DSN;
					}
					else {
						res->flags |= RSPAMD_CONTENT_TYPE_MESSAGE;
					}
				}
			}
		}
	}
	else {
		msg_warn_pool ("cannot parse content type: %*s", (gint)len, lc_data);
	}

	return res;
}

/**
 * Register one `name=value` parameter on a parsed content disposition.
 *
 * The value is passed through rspamd_mime_header_decode() before being
 * stored; the name references the caller's buffer directly. Parameters
 * sharing a name are chained in a doubly linked list whose first element is
 * the one stored in `cd->attrs` (created lazily on first use).
 *
 * When the name is exactly "filename", `cd->filename` is updated to the
 * decoded value.
 *
 * @param pool        memory pool used for the record and the decoded value
 * @param cd          content disposition to update (must not be NULL)
 * @param name_start  first byte of the parameter name
 * @param name_end    one past the last byte of the parameter name
 * @param value_start first byte of the raw parameter value
 * @param value_end   one past the last byte of the raw parameter value
 */
void
rspamd_content_disposition_add_param (rspamd_mempool_t *pool,
		struct rspamd_content_disposition *cd,
		const gchar *name_start, const gchar *name_end,
		const gchar *value_start, const gchar *value_end)
{
	struct rspamd_content_type_param *chain = NULL, *param;
	rspamd_ftok_t key;
	gchar *value;

	g_assert (cd != NULL);

	key.begin = name_start;
	key.len = name_end - name_start;

	if (cd->attrs == NULL) {
		/* Lazily create the attribute table */
		cd->attrs = g_hash_table_new (rspamd_ftok_icase_hash,
				rspamd_ftok_icase_equal);
	}
	else {
		chain = g_hash_table_lookup (cd->attrs, &key);
	}

	param = rspamd_mempool_alloc (pool, sizeof (*param));
	param->name.begin = name_start;
	param->name.len = name_end - name_start;
	value = rspamd_mime_header_decode (pool, value_start,
			value_end - value_start);
	RSPAMD_FTOK_FROM_STR (&param->value, value);

	if (chain == NULL) {
		/* First parameter with this name becomes the table entry */
		g_hash_table_insert (cd->attrs, &param->name, param);
	}

	DL_APPEND (chain, param);

	key.begin = "filename";
	key.len = sizeof ("filename") - 1;

	if (rspamd_ftok_cmp (&param->name, &key) == 0) {
		/* Latest filename parameter wins */
		cd->filename.begin = param->value.begin;
		cd->filename.len = param->value.len;
	}
}

/**
 * Parse a Content-Disposition header value.
 *
 * A lowercased copy of the input is allocated from `pool` and stored in the
 * working structure as `lc_data`; the parser itself is run on the original
 * (case-preserving) input. On success the working structure is copied into
 * pool memory and, if parameters were collected, a pool destructor is
 * registered to unref the attribute table.
 *
 * @param in   raw header value
 * @param len  number of bytes available at `in`
 * @param pool memory pool used for all allocations
 * @return parsed content disposition, or NULL (with a warning logged) when
 *         the value cannot be parsed
 */
struct rspamd_content_disposition *
rspamd_content_disposition_parse (const gchar *in,
		gsize len, rspamd_mempool_t *pool)
{
	struct rspamd_content_disposition *res = NULL, val;
	gchar *lc_data;

	/*
	 * Start from a zeroed structure so that fields the parser does not set
	 * (notably `attrs`, which is tested below) never hold stack garbage.
	 */
	memset (&val, 0, sizeof (val));

	lc_data = rspamd_mempool_alloc (pool, len);
	memcpy (lc_data, in, len);
	rspamd_str_lc (lc_data, len);
	val.lc_data = lc_data;

	if (rspamd_content_disposition_parser (in, len, &val, pool)) {
		res = rspamd_mempool_alloc (pool, sizeof (val));
		memcpy (res, &val, sizeof (val));

		if (res->attrs) {
			/* The hash table is not pool memory, release it with the pool */
			rspamd_mempool_add_destructor (pool,
					(rspamd_mempool_destruct_t)g_hash_table_unref, res->attrs);
		}
	}
	else {
		/* Log from the local copy: the parser may have modified `val` */
		msg_warn_pool ("cannot parse content disposition: %*s",
				(gint)len, lc_data);
	}

	return res;
}