[Rework] Rework exceptions and newlines processing

author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-07-13 17:03:27 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-07-13 17:07:23 +0100
commit: 70cbb6d39a06eb6f71832517bfd788ad217b6965 (patch)
tree: 5e0e41033565b271021072aa5c2455f0e79a91a7
parent: d2af2a1d52a8f9b26b7c77b12ce555db24f07df4 (diff)
download: rspamd-70cbb6d39a06eb6f71832517bfd788ad217b6965.tar.gz
rspamd-70cbb6d39a06eb6f71832517bfd788ad217b6965.zip
7 files changed, 126 insertions, 87 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index f6c023294..4605d1c69 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -658,15 +658,15 @@ detect_text_language (struct rspamd_mime_text_part *part)
 }
 
 static void
-rspamd_normalize_text_part (struct rspamd_task *task,
+rspamd_extract_words (struct rspamd_task *task,
 		struct rspamd_mime_text_part *part)
 {
 #ifdef WITH_SNOWBALL
 	struct sb_stemmer *stem = NULL;
 #endif
 	rspamd_ftok_t *w;
-	const guchar *r, *p, *c, *end;
 	gchar *temp_word;
+	const guchar *r;
 	guint i, nlen;
 
 #ifdef WITH_SNOWBALL
@@ -674,71 +674,14 @@ rspamd_normalize_text_part (struct rspamd_task *task,
 		stem = sb_stemmer_new (part->language, "UTF_8");
 		if (stem == NULL) {
 			msg_info_task ("<%s> cannot create lemmatizer for %s language",
-				task->message_id, part->language);
+					task->message_id, part->language);
 		}
 	}
 #endif
-	/* Strip newlines */
-	part->stripped_content = g_byte_array_sized_new (part->content->len);
-	part->newlines = g_ptr_array_sized_new (128);
-	p = part->content->data;
-	c = p;
-	end = p + part->content->len;
-
-	while (p < end) {
-		p = memchr (c, '\n', end - c);
-
-		if (p) {
-			if (*(p - 1) == '\r') {
-				p --;
-			}
-
-			if (p > c) {
-				g_byte_array_append (part->stripped_content, c, p - c);
-			}
-
-			/* As it could cause reallocation, we initially store offsets */
-			g_ptr_array_add (part->newlines,
-					GUINT_TO_POINTER (part->stripped_content->len));
-			part->nlines ++;
-			p ++;
-
-			while (p < end && (*p == '\r' || *p == '\n')) {
-				if (*p == '\n') {
-					part->nlines ++;
-				}
-
-				p ++;
-			}
-			c = p;
-		}
-		else {
-			p = end;
-			break;
-		}
-	}
-
-	if (p > c) {
-		g_byte_array_append (part->stripped_content, c, p - c);
-	}
-
-	/* Now convert offsets to real pointers for convenience */
-	for (i = 0; i < part->newlines->len; i ++) {
-		guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i));
-		g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off;
-	}
-
-	rspamd_mempool_add_destructor (task->task_pool,
-			(rspamd_mempool_destruct_t) free_byte_array_callback,
-			part->stripped_content);
-	rspamd_mempool_add_destructor (task->task_pool,
-			(rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
-			part->newlines);
-
 	/* Ugly workaround */
 	part->normalized_words = rspamd_tokenize_text (part->content->data,
 			part->content->len, IS_PART_UTF (part), task->cfg,
-			part->urls_offset, FALSE,
+			part->exceptions, FALSE,
 			NULL);
 
 	if (part->normalized_words) {
@@ -798,6 +741,78 @@ rspamd_normalize_text_part (struct rspamd_task *task,
 #endif
 }
 
+static void
+rspamd_normalize_text_part (struct rspamd_task *task,
+		struct rspamd_mime_text_part *part)
+{
+	/* Fills stripped_content, newlines, nlines and newline exceptions from part->content */
+	const guchar *p, *c, *end, *nl;
+	guint i;
+	struct rspamd_process_exception *ex;
+
+	/* Strip newlines: c is the start of the pending chunk, p is the scan position */
+	part->stripped_content = g_byte_array_sized_new (part->content->len);
+	part->newlines = g_ptr_array_sized_new (128);
+	p = part->content->data;
+	c = p;
+	end = p + part->content->len;
+
+	while (p < end) {
+		nl = memchr (c, '\n', end - c);
+
+		if (nl) {
+			/* Cut '\r' of CRLF; nl > c avoids reading before the buffer start */
+			p = (nl > c && *(nl - 1) == '\r') ? nl - 1 : nl;
+
+			if (p > c) {
+				g_byte_array_append (part->stripped_content, c, p - c);
+			}
+
+			/* As it could cause reallocation, we initially store offsets */
+			g_ptr_array_add (part->newlines,
+					GUINT_TO_POINTER (part->stripped_content->len));
+			ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex));
+			ex->pos = part->stripped_content->len;
+			ex->len = 0;
+			ex->type = RSPAMD_EXCEPTION_NEWLINE;
+			part->exceptions = g_list_prepend (part->exceptions, ex);
+			part->nlines ++;
+			/* Continue after the '\n' itself, so CRLF is not counted twice */
+			p = nl + 1;
+
+			while (p < end && (*p == '\r' || *p == '\n')) {
+				if (*p == '\n') {
+					part->nlines ++;
+				}
+
+				p ++;
+			}
+			c = p;
+		}
+		else {
+			p = end;
+			break;
+		}
+	}
+
+	if (p > c) {
+		g_byte_array_append (part->stripped_content, c, p - c);
+	}
+
+	/* Now convert offsets to real pointers for convenience */
+	for (i = 0; i < part->newlines->len; i ++) {
+		guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i));
+		g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off;
+	}
+
+	rspamd_mempool_add_destructor (task->task_pool,
+			(rspamd_mempool_destruct_t) free_byte_array_callback,
+			part->stripped_content);
+	rspamd_mempool_add_destructor (task->task_pool,
+			(rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
+			part->newlines);
+}
+
 #define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
 
 static guint
@@ -872,6 +887,14 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
 	return FALSE;
 }
 
+static gint
+exceptions_compare_func (gconstpointer a, gconstpointer b)
+{
+	const struct rspamd_process_exception *ea = a, *eb = b;
+	/* Orders exceptions by pos for g_list_sort; a goffset difference may not fit into gint */
+	return (ea->pos > eb->pos) - (ea->pos < eb->pos);
+}
+
 static void
 process_text_part (struct rspamd_task *task,
 	GByteArray *part_content,
@@ -932,7 +955,7 @@ process_text_part (struct rspamd_task *task,
 				task->task_pool,
 				text_part->html,
 				part_content,
-				&text_part->urls_offset,
+				&text_part->exceptions,
 				task->urls,
 				task->emails);
 
@@ -941,10 +964,10 @@ process_text_part (struct rspamd_task *task,
 		}
 
 		/* Handle offsets of this part */
-		if (text_part->urls_offset != NULL) {
-			text_part->urls_offset = g_list_reverse (text_part->urls_offset);
+		if (text_part->exceptions != NULL) {
+			text_part->exceptions = g_list_reverse (text_part->exceptions);
 			rspamd_mempool_add_destructor (task->task_pool,
-					(rspamd_mempool_destruct_t) g_list_free, text_part->urls_offset);
+					(rspamd_mempool_destruct_t) g_list_free, text_part->exceptions);
 		}
 
 		rspamd_mempool_add_destructor (task->task_pool,
@@ -1006,6 +1029,11 @@ process_text_part (struct rspamd_task *task,
 	if (!IS_PART_HTML (text_part)) {
 		rspamd_url_text_extract (task->task_pool, task, text_part, FALSE);
 	}
+
+	text_part->exceptions = g_list_sort (text_part->exceptions,
+			exceptions_compare_func);
+
+	rspamd_extract_words (task, text_part);
 }
 
 struct mime_foreach_data {
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 0d2ae74b4..3fe26e685 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -57,7 +57,7 @@ struct rspamd_mime_text_part {
 	GByteArray *stripped_content; /**< no newlines or html tags 			*/
 	GPtrArray *newlines;	/**< positions of newlines in text					*/
 	struct html_content *html;
-	GList *urls_offset;	/**< list of offsets of urls						*/
+	GList *exceptions;	/**< list of exceptions (urls, newlines)			*/
 	GMimeObject *parent;
 	struct rspamd_mime_part *mime_part;
 	GArray *normalized_words;
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 0a25e488a..1188515c5 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -1601,7 +1601,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
 	gint substate = 0, len, href_offset = -1;
 	struct html_tag *cur_tag = NULL;
 	struct rspamd_url *url = NULL, *turl;
-	struct process_exception *ex;
+	struct rspamd_process_exception *ex;
 	enum {
 		parse_start = 0,
 		tag_begin,
@@ -1977,6 +1977,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
 								ex = rspamd_mempool_alloc (pool, sizeof (*ex));
 								ex->pos = href_offset;
 								ex->len = dest->len - href_offset;
+								ex->type = RSPAMD_EXCEPTION_URL;
 
 								*exceptions = g_list_prepend (*exceptions, ex);
 							}
diff --git a/src/libserver/url.c b/src/libserver/url.c
index 823e32a43..1ccc91a27 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -2294,17 +2294,18 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
 		gsize end_offset, gpointer ud)
 {
 	struct rspamd_url_mimepart_cbdata *cbd = ud;
-	struct process_exception *ex;
+	struct rspamd_process_exception *ex;
 	struct rspamd_task *task;
 	gchar *url_str = NULL;
 	struct rspamd_url *query_url;
 	gint rc;
 
 	task = cbd->task;
-	ex = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct process_exception));
+	ex = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_process_exception));
 
 	ex->pos = start_offset;
 	ex->len = end_offset - start_offset;
+	ex->type = RSPAMD_EXCEPTION_URL;
 
 	if (url->protocol == PROTOCOL_MAILTO) {
 		if (url->userlen > 0) {
@@ -2320,8 +2321,8 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
 		}
 	}
 
-	cbd->part->urls_offset = g_list_prepend (
-			cbd->part->urls_offset,
+	cbd->part->exceptions = g_list_prepend (
+			cbd->part->exceptions,
 			ex);
 
 	/* We also search the query for additional url inside */
@@ -2376,10 +2377,10 @@ rspamd_url_text_extract (rspamd_mempool_t *pool,
 			rspamd_url_text_part_callback, &mcbd);
 
 	/* Handle offsets of this part */
-	if (part->urls_offset != NULL) {
-		part->urls_offset = g_list_reverse (part->urls_offset);
+	if (part->exceptions != NULL) {
+		part->exceptions = g_list_reverse (part->exceptions);
 		rspamd_mempool_add_destructor (task->task_pool,
-				(rspamd_mempool_destruct_t) g_list_free, part->urls_offset);
+				(rspamd_mempool_destruct_t) g_list_free, part->exceptions);
 	}
 }
 
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 4e0e4b75d..6eab11f98 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -75,7 +75,7 @@ rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf,
 {
 	gsize remain, pos;
 	const gchar *p;
-	struct process_exception *ex = NULL;
+	struct rspamd_process_exception *ex = NULL;
 
 	if (buf == NULL) {
 		return FALSE;
@@ -166,11 +166,12 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf,
 		GList **exceptions, gboolean is_utf, gsize *rl,
 		gboolean check_signature)
 {
-	gsize remain, pos, siglen = 0;
+	gsize remain, siglen = 0;
+	goffset pos;
 	const gchar *p, *next_p, *sig = NULL;
 	gunichar uc;
 	guint processed = 0;
-	struct process_exception *ex = NULL;
+	struct rspamd_process_exception *ex = NULL;
 	enum {
 		skip_delimiters = 0,
 		feed_token,
@@ -214,10 +215,12 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf,
 
 		switch (state) {
 		case skip_delimiters:
-			if (ex != NULL && p - buf->begin == (gint)ex->pos) {
-				token->begin = "!!EX!!";
-				token->len = sizeof ("!!EX!!") - 1;
-				processed = token->len;
+			if (ex != NULL && p - buf->begin == ex->pos) {
+				if (ex->type == RSPAMD_EXCEPTION_URL) {
+					token->begin = "!!EX!!";
+					token->len = sizeof ("!!EX!!") - 1;
+					processed = token->len;
+				}
 				state = skip_exception;
 				continue;
 			}
@@ -270,12 +273,13 @@ set_token:
 		*rl = processed;
 	}
 
-	if (token->len == 0) {
+	if (token->len == 0 && processed > 0) {
 		token->len = p - token->begin;
 		g_assert (token->len > 0);
-		*cur = p;
 	}
 
+	*cur = p;
+
 	return TRUE;
 }
 
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index ccbcec6e6..6ce4179f1 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -733,7 +733,7 @@ lua_util_tokenize_text (lua_State *L)
 	gsize len, pos, ex_len, i;
 	GList *exceptions = NULL, *cur;
 	struct rspamd_lua_text *t;
-	struct process_exception *ex;
+	struct rspamd_process_exception *ex;
 	GArray *res;
 	rspamd_ftok_t *w;
 	gboolean compat = FALSE;
diff --git a/src/rspamd.h b/src/rspamd.h
index 5626337fc..ffebfe387 100644
--- a/src/rspamd.h
+++ b/src/rspamd.h
@@ -265,12 +265,17 @@ struct rspamd_main {
 	struct event_base *ev_base;
 };
 
+enum rspamd_exception_type {
+	RSPAMD_EXCEPTION_NEWLINE = 0, /**< position of a stripped line break in a text part */
+	RSPAMD_EXCEPTION_URL, /**< span of a URL found in html or plain text */
+};
 /**
  * Structure to point exception in text from processing
  */
-struct process_exception {
-	gsize pos;
-	gsize len;
+struct rspamd_process_exception {
+	goffset pos; /**< offset in the text where the exception starts */
+	guint len; /**< length of the excepted span, 0 for newline exceptions */
+	enum rspamd_exception_type type; /**< kind of the exception */
 };
 
 /**
author	Vsevolod Stakhov <vsevolod@highsecure.ru>	2016-07-13 17:03:27 +0100
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>	2016-07-13 17:07:23 +0100
commit	70cbb6d39a06eb6f71832517bfd788ad217b6965 (patch)
tree	5e0e41033565b271021072aa5c2455f0e79a91a7
parent	d2af2a1d52a8f9b26b7c77b12ce555db24f07df4 (diff)
download	rspamd-70cbb6d39a06eb6f71832517bfd788ad217b6965.tar.gz rspamd-70cbb6d39a06eb6f71832517bfd788ad217b6965.zip