[Minor] Move headers parsing to a separate unit

author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-12-13 10:06:59 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-12-13 10:07:22 +0000
commit: 1f5d9867a1257fe5633f8a03cd1aa09cc2fd84ee (patch)
tree: 87ac5e8b6f3d5724354e94e2f0cc155a9b1493a9 /src/libmime
parent: 14d9177ec87e420688db7a98eba6cff953eef54a (diff)
download: rspamd-1f5d9867a1257fe5633f8a03cd1aa09cc2fd84ee.tar.gz
rspamd-1f5d9867a1257fe5633f8a03cd1aa09cc2fd84ee.zip
5 files changed, 359 insertions, 305 deletions
diff --git a/src/libmime/CMakeLists.txt b/src/libmime/CMakeLists.txt
index 39bd2d402..3ec13937a 100644
--- a/src/libmime/CMakeLists.txt
+++ b/src/libmime/CMakeLists.txt
@@ -6,6 +6,7 @@ SET(LIBRSPAMDMIMESRC
 				${CMAKE_CURRENT_SOURCE_DIR}/images.c
 				${CMAKE_CURRENT_SOURCE_DIR}/message.c
 				${CMAKE_CURRENT_SOURCE_DIR}/archives.c
-				${CMAKE_CURRENT_SOURCE_DIR}/content_type.c)
+				${CMAKE_CURRENT_SOURCE_DIR}/content_type.c
+				${CMAKE_CURRENT_SOURCE_DIR}/mime_headers.c)
 
 SET(RSPAMD_MIME ${LIBRSPAMDMIMESRC} PARENT_SCOPE)
 \ No newline at end of file
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 7ed0d2c01..cb9009d7b 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -52,293 +52,6 @@ rspamd_message_quark (void)
 	return g_quark_from_static_string ("mime-error");
 }
 
-static void
-append_raw_header (struct rspamd_task *task,
-		GHashTable *target, struct raw_header *rh)
-{
-	GPtrArray *ar;
-
-	if ((ar = g_hash_table_lookup (target, rh->name)) != NULL) {
-		g_ptr_array_add (ar, rh);
-		msg_debug_task ("append raw header %s: %s", rh->name, rh->value);
-	}
-	else {
-		ar = g_ptr_array_sized_new (2);
-		g_ptr_array_add (ar, rh);
-		g_hash_table_insert (target, rh->name, ar);
-		msg_debug_task ("add new raw header %s: %s", rh->name, rh->value);
-	}
-}
-
-/* Convert raw headers to a list of struct raw_header * */
-static void
-process_raw_headers (struct rspamd_task *task, GHashTable *target,
-		const gchar *in, gsize len)
-{
-	struct raw_header *new = NULL;
-	const gchar *p, *c, *end;
-	gchar *tmp, *tp;
-	gint state = 0, l, next_state = 100, err_state = 100, t_state;
-	gboolean valid_folding = FALSE;
-	guint nlines_count[RSPAMD_TASK_NEWLINES_MAX];
-
-	p = in;
-	end = p + len;
-	c = p;
-	memset (nlines_count, 0, sizeof (nlines_count));
-	msg_debug_task ("start processing headers");
-
-	while (p < end) {
-		/* FSM for processing headers */
-		switch (state) {
-		case 0:
-			/* Begin processing headers */
-			if (!g_ascii_isalpha (*p)) {
-				/* We have some garbage at the beginning of headers, skip this line */
-				state = 100;
-				next_state = 0;
-			}
-			else {
-				state = 1;
-				c = p;
-			}
-			break;
-		case 1:
-			/* We got something like header's name */
-			if (*p == ':') {
-				new =
-					rspamd_mempool_alloc0 (task->task_pool,
-						sizeof (struct raw_header));
-				l = p - c;
-				tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
-				rspamd_strlcpy (tmp, c, l + 1);
-				new->name = tmp;
-				new->empty_separator = TRUE;
-				new->raw_value = c;
-				new->raw_len = p - c; /* Including trailing ':' */
-				p++;
-				state = 2;
-				c = p;
-			}
-			else if (g_ascii_isspace (*p)) {
-				/* Not header but some garbage */
-				task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS;
-				state = 100;
-				next_state = 0;
-			}
-			else {
-				p++;
-			}
-			break;
-		case 2:
-			/* We got header's name, so skip any \t or spaces */
-			if (*p == '\t') {
-				new->tab_separated = TRUE;
-				new->empty_separator = FALSE;
-				p++;
-			}
-			else if (*p == ' ') {
-				new->empty_separator = FALSE;
-				p++;
-			}
-			else if (*p == '\n' || *p == '\r') {
-
-				if (*p == '\n') {
-					nlines_count[RSPAMD_TASK_NEWLINES_LF] ++;
-				}
-				else if (*(p + 1) == '\n') {
-					nlines_count[RSPAMD_TASK_NEWLINES_CRLF] ++;
-				}
-				else {
-					nlines_count[RSPAMD_TASK_NEWLINES_CR] ++;
-				}
-
-				/* Process folding */
-				state = 99;
-				l = p - c;
-				if (l > 0) {
-					tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
-					rspamd_strlcpy (tmp, c, l + 1);
-					new->separator = tmp;
-				}
-				next_state = 3;
-				err_state = 5;
-				c = p;
-			}
-			else {
-				/* Process value */
-				l = p - c;
-				if (l >= 0) {
-					tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
-					rspamd_strlcpy (tmp, c, l + 1);
-					new->separator = tmp;
-				}
-				c = p;
-				state = 3;
-			}
-			break;
-		case 3:
-			if (*p == '\r' || *p == '\n') {
-				/* Hold folding */
-				if (*p == '\n') {
-					nlines_count[RSPAMD_TASK_NEWLINES_LF] ++;
-				}
-				else if (*(p + 1) == '\n') {
-					nlines_count[RSPAMD_TASK_NEWLINES_CRLF] ++;
-				}
-				else {
-					nlines_count[RSPAMD_TASK_NEWLINES_CR] ++;
-				}
-				state = 99;
-				next_state = 3;
-				err_state = 4;
-			}
-			else if (p + 1 == end) {
-				state = 4;
-			}
-			else {
-				p++;
-			}
-			break;
-		case 4:
-			/* Copy header's value */
-			l = p - c;
-			tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
-			tp = tmp;
-			t_state = 0;
-			while (l--) {
-				if (t_state == 0) {
-					/* Before folding */
-					if (*c == '\n' || *c == '\r') {
-						t_state = 1;
-						c++;
-						*tp++ = ' ';
-					}
-					else {
-						*tp++ = *c++;
-					}
-				}
-				else if (t_state == 1) {
-					/* Inside folding */
-					if (g_ascii_isspace (*c)) {
-						c++;
-					}
-					else {
-						t_state = 0;
-						*tp++ = *c++;
-					}
-				}
-			}
-			/* Strip last space that can be added by \r\n parsing */
-			if (*(tp - 1) == ' ') {
-				tp--;
-			}
-
-			*tp = '\0';
-			/* Strip the initial spaces that could also be added by folding */
-			while (*tmp != '\0' && g_ascii_isspace (*tmp)) {
-				tmp ++;
-			}
-
-			if (p + 1 == end) {
-				new->raw_len = end - new->raw_value;
-			}
-			else {
-				new->raw_len = p - new->raw_value;
-			}
-
-			new->value = tmp;
-			new->decoded = g_mime_utils_header_decode_text (new->value);
-
-			if (new->decoded != NULL) {
-				rspamd_mempool_add_destructor (task->task_pool,
-						(rspamd_mempool_destruct_t)g_free, new->decoded);
-			}
-			else {
-				new->decoded = "";
-			}
-
-			append_raw_header (task, target, new);
-			state = 0;
-			break;
-		case 5:
-			/* Header has only name, no value */
-			new->value = "";
-			new->decoded = "";
-			append_raw_header (task, target, new);
-			state = 0;
-			break;
-		case 99:
-			/* Folding state */
-			if (p + 1 == end) {
-				state = err_state;
-			}
-			else {
-				if (*p == '\r' || *p == '\n') {
-					p++;
-					valid_folding = FALSE;
-				}
-				else if (*p == '\t' || *p == ' ') {
-					/* Valid folding */
-					p++;
-					valid_folding = TRUE;
-				}
-				else {
-					if (valid_folding) {
-						debug_task ("go to state: %d->%d", state, next_state);
-						state = next_state;
-					}
-					else {
-						/* Fall back */
-						debug_task ("go to state: %d->%d", state, err_state);
-						state = err_state;
-					}
-				}
-			}
-			break;
-		case 100:
-			/* Fail state, skip line */
-
-			if (*p == '\r') {
-				if (*(p + 1) == '\n') {
-					nlines_count[RSPAMD_TASK_NEWLINES_CRLF] ++;
-					p++;
-				}
-				p++;
-				state = next_state;
-			}
-			else if (*p == '\n') {
-				nlines_count[RSPAMD_TASK_NEWLINES_LF] ++;
-
-				if (*(p + 1) == '\r') {
-					p++;
-				}
-				p++;
-				state = next_state;
-			}
-			else if (p + 1 == end) {
-				state = next_state;
-				p++;
-			}
-			else {
-				p++;
-			}
-			break;
-		}
-	}
-
-	guint max_cnt = 0;
-	gint sel = 0;
-
-	for (gint i = 0; i < RSPAMD_TASK_NEWLINES_MAX; i ++) {
-		if (nlines_count[i] > max_cnt) {
-			max_cnt = nlines_count[i];
-			sel = i;
-		}
-	}
-
-	task->nlines_type = sel;
-}
 
 static void
 free_byte_array_callback (void *pointer)
@@ -1198,8 +911,8 @@ mime_foreach_callback (GMimeObject * part, gpointer user_data)
 				rspamd_strcase_equal, NULL, rspamd_ptr_array_free_hard);
 
 		if (hdrs != NULL) {
-			process_raw_headers (task, mime_part->raw_headers,
-					hdrs, strlen (hdrs));
+			rspamd_mime_headers_process (task, mime_part->raw_headers,
+					hdrs, strlen (hdrs), FALSE);
 			mime_part->raw_headers_str = hdrs;
 		}
 
@@ -1267,8 +980,8 @@ mime_foreach_callback (GMimeObject * part, gpointer user_data)
 						rspamd_strcase_equal, NULL, rspamd_ptr_array_free_hard);
 
 				if (hdrs != NULL) {
-					process_raw_headers (task, mime_part->raw_headers,
-							hdrs, strlen (hdrs));
+					rspamd_mime_headers_process (task, mime_part->raw_headers,
+							hdrs, strlen (hdrs), FALSE);
 					mime_part->raw_headers_str = hdrs;
 				}
 
@@ -1538,9 +1251,10 @@ rspamd_message_parse (struct rspamd_task *task)
 				task->raw_headers_content.body_start = p + body_pos;
 
 				if (task->raw_headers_content.len > 0) {
-					process_raw_headers (task, task->raw_headers,
+					rspamd_mime_headers_process (task, task->raw_headers,
 							task->raw_headers_content.begin,
-							task->raw_headers_content.len);
+							task->raw_headers_content.len,
+							TRUE);
 				}
 			}
 		}
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 8003d073c..ff7ebafc9 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -10,6 +10,7 @@
 #include "email_addr.h"
 #include "addr.h"
 #include "cryptobox.h"
+#include "mime_headers.h"
 #include <gmime/gmime.h>
 
 struct rspamd_task;
@@ -91,17 +92,6 @@ struct received_header {
 	enum rspamd_received_type type;
 };
 
-struct raw_header {
-	gchar *name;
-	gchar *value;
-	const gchar *raw_value; /* As it is in the message (unfolded and unparsed) */
-	gsize raw_len;
-	gboolean tab_separated;
-	gboolean empty_separator;
-	gchar *separator;
-	gchar *decoded;
-};
-
 /**
  * Parse and pre-process mime message
  * @param task worker_task object
diff --git a/src/libmime/mime_headers.c b/src/libmime/mime_headers.c
new file mode 100644
index 000000000..8ba8e6c60
--- /dev/null
+++ b/src/libmime/mime_headers.c
@@ -0,0 +1,312 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "mime_headers.h"
+#include "task.h"
+
+static void
+rspamd_mime_header_add (struct rspamd_task *task,
+		GHashTable *target, struct raw_header *rh)
+{
+	/* Headers sharing a name are appended to one array, in order of arrival */
+	GPtrArray *values = g_hash_table_lookup (target, rh->name);
+	if (values == NULL) {
+		values = g_ptr_array_sized_new (2);
+		g_ptr_array_add (values, rh);
+		g_hash_table_insert (target, rh->name, values);
+		msg_debug_task ("add new raw header %s: %s", rh->name, rh->value);
+	}
+	else {
+		g_ptr_array_add (values, rh);
+		msg_debug_task ("append raw header %s: %s", rh->name, rh->value);
+	}
+}
+
+/* Parse `len` bytes of raw headers at `in` into `target` (name -> GPtrArray of struct raw_header *); if check_newlines is set, store the most frequent newline style in task->nlines_type */
+void
+rspamd_mime_headers_process (struct rspamd_task *task, GHashTable *target,
+		const gchar *in, gsize len, gboolean check_newlines)
+{
+	struct raw_header *new = NULL;
+	const gchar *p, *c, *end; /* p: cursor, c: start of current token, end: one past the input */
+	gchar *tmp, *tp;
+	gint state = 0, l, next_state = 100, err_state = 100, t_state;
+	gboolean valid_folding = FALSE;
+	guint nlines_count[RSPAMD_TASK_NEWLINES_MAX];
+
+	p = in;
+	end = p + len;
+	c = p;
+	memset (nlines_count, 0, sizeof (nlines_count));
+	msg_debug_task ("start processing headers");
+
+	while (p < end) {
+		/* FSM for processing headers */
+		switch (state) {
+		case 0:
+			/* State 0: start of a line; a header name must begin with an ASCII letter */
+			if (!g_ascii_isalpha (*p)) {
+				/* We have some garbage at the beginning of headers, skip this line */
+				state = 100;
+				next_state = 0;
+			}
+			else {
+				state = 1;
+				c = p;
+			}
+			break;
+		case 1:
+			/* State 1: inside something like a header's name, waiting for ':' */
+			if (*p == ':') {
+				new =
+					rspamd_mempool_alloc0 (task->task_pool,
+						sizeof (struct raw_header));
+				l = p - c;
+				tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
+				rspamd_strlcpy (tmp, c, l + 1);
+				new->name = tmp;
+				new->empty_separator = TRUE;
+				new->raw_value = c;
+				new->raw_len = p - c; /* Name only (without ':'); state 4 extends it to the whole header */
+				p++;
+				state = 2;
+				c = p;
+			}
+			else if (g_ascii_isspace (*p)) {
+				/* Not header but some garbage */
+				task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS;
+				state = 100;
+				next_state = 0;
+			}
+			else {
+				p++;
+			}
+			break;
+		case 2:
+			/* State 2: we got header's name, so skip any \t or spaces before the value */
+			if (*p == '\t') {
+				new->tab_separated = TRUE;
+				new->empty_separator = FALSE;
+				p++;
+			}
+			else if (*p == ' ') {
+				new->empty_separator = FALSE;
+				p++;
+			}
+			else if (*p == '\n' || *p == '\r') {
+
+				if (check_newlines) {
+					if (*p == '\n') {
+						nlines_count[RSPAMD_TASK_NEWLINES_LF] ++;
+					}
+					else if (p + 1 < end && *(p + 1) == '\n') {
+						nlines_count[RSPAMD_TASK_NEWLINES_CRLF] ++;
+					}
+					else {
+						nlines_count[RSPAMD_TASK_NEWLINES_CR] ++;
+					}
+				}
+
+				/* Process folding */
+				state = 99;
+				l = p - c;
+				if (l > 0) {
+					tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
+					rspamd_strlcpy (tmp, c, l + 1);
+					new->separator = tmp;
+				}
+				next_state = 3;
+				err_state = 5;
+				c = p;
+			}
+			else {
+				/* Process value */
+				l = p - c;
+				if (l >= 0) {
+					tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
+					rspamd_strlcpy (tmp, c, l + 1);
+					new->separator = tmp;
+				}
+				c = p;
+				state = 3;
+			}
+			break;
+		case 3: /* State 3: inside the value, until a newline or the last input byte */
+			if (*p == '\r' || *p == '\n') {
+				/* Hold folding */
+				if (check_newlines) {
+					if (*p == '\n') {
+						nlines_count[RSPAMD_TASK_NEWLINES_LF] ++;
+					}
+					else if (p + 1 < end && *(p + 1) == '\n') {
+						nlines_count[RSPAMD_TASK_NEWLINES_CRLF] ++;
+					}
+					else {
+						nlines_count[RSPAMD_TASK_NEWLINES_CR] ++;
+					}
+				}
+				state = 99;
+				next_state = 3;
+				err_state = 4;
+			}
+			else if (p + 1 == end) {
+				state = 4;
+			}
+			else {
+				p++;
+			}
+			break;
+		case 4:
+			/* State 4: copy header's value, turning each fold into a single space */
+			l = p - c;
+			tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
+			tp = tmp;
+			t_state = 0;
+			while (l--) {
+				if (t_state == 0) {
+					/* Before folding */
+					if (*c == '\n' || *c == '\r') {
+						t_state = 1;
+						c++;
+						*tp++ = ' ';
+					}
+					else {
+						*tp++ = *c++;
+					}
+				}
+				else if (t_state == 1) {
+					/* Inside folding */
+					if (g_ascii_isspace (*c)) {
+						c++;
+					}
+					else {
+						t_state = 0;
+						*tp++ = *c++;
+					}
+				}
+			}
+			/* Strip last space that can be added by \r\n parsing; tp == tmp means nothing was copied */
+			if (tp > tmp && *(tp - 1) == ' ') {
+				tp--;
+			}
+
+			*tp = '\0';
+			/* Strip the initial spaces that could also be added by folding */
+			while (*tmp != '\0' && g_ascii_isspace (*tmp)) {
+				tmp ++;
+			}
+
+			if (p + 1 == end) {
+				new->raw_len = end - new->raw_value;
+			}
+			else {
+				new->raw_len = p - new->raw_value;
+			}
+
+			new->value = tmp;
+			new->decoded = g_mime_utils_header_decode_text (new->value);
+
+			if (new->decoded != NULL) {
+				rspamd_mempool_add_destructor (task->task_pool,
+						(rspamd_mempool_destruct_t)g_free, new->decoded);
+			}
+			else {
+				new->decoded = "";
+			}
+
+			rspamd_mime_header_add (task, target, new);
+			state = 0;
+			break;
+		case 5:
+			/* State 5: header has only name, no value */
+			new->value = "";
+			new->decoded = "";
+			rspamd_mime_header_add (task, target, new);
+			state = 0;
+			break;
+		case 99:
+			/* State 99: after a newline; whitespace next means folding, otherwise go to err_state */
+			if (p + 1 == end) {
+				state = err_state;
+			}
+			else {
+				if (*p == '\r' || *p == '\n') {
+					p++;
+					valid_folding = FALSE;
+				}
+				else if (*p == '\t' || *p == ' ') {
+					/* Valid folding */
+					p++;
+					valid_folding = TRUE;
+				}
+				else {
+					if (valid_folding) {
+						debug_task ("go to state: %d->%d", state, next_state);
+						state = next_state;
+					}
+					else {
+						/* Fall back */
+						debug_task ("go to state: %d->%d", state, err_state);
+						state = err_state;
+					}
+				}
+			}
+			break;
+		case 100:
+			/* State 100: fail state, skip line and continue from next_state */
+
+			if (*p == '\r') {
+				if (p + 1 < end && *(p + 1) == '\n') {
+					nlines_count[RSPAMD_TASK_NEWLINES_CRLF] ++;
+					p++;
+				}
+				p++;
+				state = next_state;
+			}
+			else if (*p == '\n') {
+				nlines_count[RSPAMD_TASK_NEWLINES_LF] ++;
+
+				if (p + 1 < end && *(p + 1) == '\r') {
+					p++;
+				}
+				p++;
+				state = next_state;
+			}
+			else if (p + 1 == end) {
+				state = next_state;
+				p++;
+			}
+			else {
+				p++;
+			}
+			break;
+		}
+	}
+
+	if (check_newlines) {
+		guint max_cnt = 0;
+		gint sel = 0;
+
+		for (gint i = 0; i < RSPAMD_TASK_NEWLINES_MAX; i ++) {
+			if (nlines_count[i] > max_cnt) {
+				max_cnt = nlines_count[i];
+				sel = i;
+			}
+		}
+
+		task->nlines_type = sel;
+	}
+}
diff --git a/src/libmime/mime_headers.h b/src/libmime/mime_headers.h
new file mode 100644
index 000000000..aa76bed18
--- /dev/null
+++ b/src/libmime/mime_headers.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_LIBMIME_MIME_HEADERS_H_
+#define SRC_LIBMIME_MIME_HEADERS_H_
+
+#include "config.h"
+
+struct rspamd_task;
+
+struct raw_header { /* One parsed header, filled in by rspamd_mime_headers_process */
+	gchar *name; /* Copy of the text before ':', allocated from the task memory pool */
+	gchar *value; /* Value with folds collapsed and leading whitespace skipped; "" when there is no value */
+	const gchar *raw_value; /* Points into the parsed input at the first byte of the header name (not a copy, still folded) */
+	gsize raw_len; /* Length in bytes counted from raw_value; only the name length for value-less headers */
+	gboolean tab_separated; /* TRUE when a tab was seen between ':' and the value */
+	gboolean empty_separator; /* TRUE when neither a space nor a tab follows ':' */
+	gchar *separator; /* Copy of the spaces/tabs between ':' and the value; may be NULL or empty */
+	gchar *decoded; /* Result of g_mime_utils_header_decode_text (value), or "" when that is NULL or there is no value */
+};
+/* Parse `len` bytes of raw headers at `in` into `target` (name -> GPtrArray of struct raw_header *); sets task->nlines_type when check_newlines is TRUE */
+void rspamd_mime_headers_process (struct rspamd_task *task, GHashTable *target,
+		const gchar *in, gsize len, gboolean check_newlines);
+
+#endif /* SRC_LIBMIME_MIME_HEADERS_H_ */
author	Vsevolod Stakhov <vsevolod@highsecure.ru>	2016-12-13 10:06:59 +0000
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>	2016-12-13 10:07:22 +0000
commit	1f5d9867a1257fe5633f8a03cd1aa09cc2fd84ee (patch)
tree	87ac5e8b6f3d5724354e94e2f0cc155a9b1493a9 /src/libmime
parent	14d9177ec87e420688db7a98eba6cff953eef54a (diff)
download	rspamd-1f5d9867a1257fe5633f8a03cd1aa09cc2fd84ee.tar.gz rspamd-1f5d9867a1257fe5633f8a03cd1aa09cc2fd84ee.zip