Reorganize statfiles and classifiers into libstat.

author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-01-16 15:28:40 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-01-16 15:28:40 +0000
commit: b5597411a2a4f9b46c0076ccddb95f8eacc1cb7f (patch)
tree: a647a4306708df37a3ea1d97666fd2d325e24464 /src/libstat/tokenizers
parent: ffd95d7c71307bb9540f07bbaac3b04859226837 (diff)
download: rspamd-b5597411a2a4f9b46c0076ccddb95f8eacc1cb7f.tar.gz
rspamd-b5597411a2a4f9b46c0076ccddb95f8eacc1cb7f.zip
2 files changed, 382 insertions, 0 deletions
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
new file mode 100644
index 000000000..9dd12a8dd
--- /dev/null
+++ b/src/libstat/tokenizers/osb.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2009-2012, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * OSB tokenizer
+ */
+
+#include <sys/types.h>
+#include "tokenizers.h"
+
+/*
+ * Minimum length of token.
+ * This macro is not referenced anywhere else in this file.
+ */
+#define MIN_LEN 4
+
+/* Multiplier table used to mix window hashes; it is defined in tokenizers.c */
+extern const int primes[];
+
+/*
+ * Combine the newest hash of the window (hashpipe[0]) with the hash stored at
+ * position idx into an (h1, h2) pair and insert a node for this pair into
+ * tree, unless a node that compares equal is already stored there.
+ *
+ * The duplicate check is done with a stack-allocated probe, so no pool
+ * allocation (node or token copy) is made for pairs that are already present.
+ * This assumes that the comparator of the tree only inspects h1/h2, as
+ * token_node_compare_func does.
+ */
+static void
+osb_add_feature (rspamd_mempool_t *pool,
+	GTree *tree,
+	const guint32 *hashpipe,
+	gint idx,
+	rspamd_fstring_t *token,
+	gboolean save_token)
+{
+	token_node_t probe, *node;
+	guint32 h1, h2;
+
+	/* Unsigned 32 bit arithmetic: wrap-around is intended here */
+	h1 = hashpipe[0] * primes[0] + hashpipe[idx] * primes[idx << 1];
+	h2 = hashpipe[0] * primes[1] + hashpipe[idx] * primes[(idx << 1) - 1];
+
+	memset (&probe, 0, sizeof (probe));
+	probe.h1 = h1;
+	probe.h2 = h2;
+
+	if (g_tree_lookup (tree, &probe) != NULL) {
+		/* This pair has been seen already */
+		return;
+	}
+
+	node = rspamd_mempool_alloc0 (pool, sizeof (*node));
+	node->h1 = h1;
+	node->h2 = h2;
+	if (save_token) {
+		/* Keep a pool-allocated copy of the current word */
+		node->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, token);
+	}
+
+	/* The node is used both as the key and as the value */
+	g_tree_insert (tree, node, node);
+}
+
+/*
+ * OSB tokenizer: turn an array of words into a tree of hash pairs.
+ *
+ * tokenizer  - not used by this implementation
+ * pool       - memory pool used for nodes, token copies and the tree destructor
+ * input      - GArray of rspamd_fstring_t words
+ * tree       - in/out: *tree is created (and registered for destruction in
+ *              pool) if it is NULL; new nodes are inserted into it
+ * save_token - if TRUE, a copy of the current word is stored in node->extra
+ * is_utf     - passed through to rspamd_fstrhash_lc
+ * exceptions - not used by this implementation
+ *
+ * Returns FALSE if input or tree is NULL, TRUE otherwise.
+ *
+ * The hashes of the last FEATURE_WINDOW_SIZE words are kept in hashpipe with
+ * the newest one at index 0. The first FEATURE_WINDOW_SIZE words only fill the
+ * window; every following word is paired with each of the older hashes in the
+ * window.
+ */
+int
+osb_tokenize_text (struct tokenizer *tokenizer,
+	rspamd_mempool_t * pool,
+	GArray * input,
+	GTree ** tree,
+	gboolean save_token,
+	gboolean is_utf,
+	GList *exceptions)
+{
+	rspamd_fstring_t *token = NULL;
+	guint32 hashpipe[FEATURE_WINDOW_SIZE];
+	gint i, processed = 0;
+	guint w;
+
+	if (input == NULL || tree == NULL) {
+		return FALSE;
+	}
+
+	if (*tree == NULL) {
+		*tree = g_tree_new (token_node_compare_func);
+		rspamd_mempool_add_destructor (pool,
+			(rspamd_mempool_destruct_t) g_tree_destroy,
+			*tree);
+	}
+
+	/* Slots that are never filled keep this 0xfe filler pattern */
+	memset (hashpipe, 0xfe, sizeof (hashpipe));
+
+	for (w = 0; w < input->len; w ++) {
+		token = &g_array_index (input, rspamd_fstring_t, w);
+
+		if (processed < FEATURE_WINDOW_SIZE) {
+			/* Just fill a hashpipe, from its last slot towards slot 0 */
+			hashpipe[FEATURE_WINDOW_SIZE - ++processed] =
+				rspamd_fstrhash_lc (token, is_utf);
+		}
+		else {
+			/* Shift hashpipe and put the new hash at its head */
+			for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
+				hashpipe[i] = hashpipe[i - 1];
+			}
+			hashpipe[0] = rspamd_fstrhash_lc (token, is_utf);
+			processed++;
+
+			for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
+				osb_add_feature (pool, *tree, hashpipe, i, token, save_token);
+			}
+		}
+	}
+
+	/*
+	 * Short input: the window has never been shifted, so no feature has been
+	 * produced in the loop above. Pair hashpipe[0] with the following slots.
+	 * NOTE(review): with fewer than FEATURE_WINDOW_SIZE words hashpipe[0] (and
+	 * some of the following slots) still hold the filler pattern; this is kept
+	 * as is, because changing it would change the generated hashes.
+	 */
+	if (processed <= FEATURE_WINDOW_SIZE) {
+		for (i = 1; i < processed; i++) {
+			/* token is the last word of input here */
+			osb_add_feature (pool, *tree, hashpipe, i, token, save_token);
+		}
+	}
+
+	return TRUE;
+}
+
+/*
+ * vi:ts=4
+ */
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
new file mode 100644
index 000000000..3e6c745ec
--- /dev/null
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2009-2012, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Common tokenization functions
+ */
+
+#include <sys/types.h>
+#include "main.h"
+#include "tokenizers.h"
+
+/* Table of available tokenizers; get_tokenizer () searches it by name */
+struct tokenizer tokenizers[] = {
+	{
+		"osb-text",			/* name used for lookups */
+		osb_tokenize_text,		/* turns words into hash pairs */
+		rspamd_tokenizer_get_word	/* extracts the next word from a buffer */
+	},
+};
+
+/*
+ * Multipliers used by the OSB tokenizer to mix two word hashes into a pair of
+ * feature hashes. Despite the name, not every entry is a prime number
+ * (e.g. 1 and 51).
+ */
+const int primes[] = {
+	/*  [0] ..  [3] */	1, 7, 3, 13,
+	/*  [4] ..  [7] */	5, 29, 11, 51,
+	/*  [8] .. [11] */	23, 101, 47, 203,
+	/* [12] .. [15] */	97, 407, 197, 817,
+	/* [16] .. [19] */	397, 1637, 797, 3277,
+};
+
+/*
+ * Word delimiters lookup table, indexed by byte value: a non-zero entry marks
+ * a delimiter. Entries that are not listed below are zero. Character names in
+ * the comments assume ASCII.
+ *
+ * NOTE(review): the array has 255 elements (valid indices are 0..254), so it
+ * has no entry for byte value 255.
+ */
+const gchar t_delimiters[255] = {
+	/* Whitespace */
+	[9] = 1,	/* horizontal tab */
+	[10] = 1,	/* line feed */
+	[13] = 1,	/* carriage return */
+	[32] = 1,	/* space */
+	/* 34 .. 38 */
+	[34] = 1,	/* double quote */
+	[35] = 1,	/* # */
+	[36] = 1,	/* $ */
+	[37] = 1,	/* % */
+	[38] = 1,	/* & */
+	/* 40 .. 47 */
+	[40] = 1,	/* ( */
+	[41] = 1,	/* ) */
+	[42] = 1,	/* asterisk */
+	[43] = 1,	/* + */
+	[44] = 1,	/* , */
+	[45] = 1,	/* - */
+	[46] = 1,	/* . */
+	[47] = 1,	/* slash */
+	/* 58 .. 63 */
+	[58] = 1,	/* : */
+	[59] = 1,	/* ; */
+	[60] = 1,	/* < */
+	[61] = 1,	/* = */
+	[62] = 1,	/* > */
+	[63] = 1,	/* question mark */
+	/* 91 .. 96 */
+	[91] = 1,	/* [ */
+	[92] = 1,	/* backslash */
+	[93] = 1,	/* ] */
+	[94] = 1,	/* ^ */
+	[95] = 1,	/* _ */
+	[96] = 1,	/* backtick */
+	/* 123 .. 126 */
+	[123] = 1,	/* { */
+	[124] = 1,	/* | */
+	[125] = 1,	/* } */
+	[126] = 1	/* ~ */
+};
+
+/*
+ * Find a tokenizer by its name.
+ *
+ * Returns a pointer to the matching entry of the tokenizers table, or NULL if
+ * name is NULL or no entry has this exact (case-sensitive) name.
+ */
+struct tokenizer *
+get_tokenizer (const char *name)
+{
+	guint i;
+
+	if (name == NULL) {
+		/* strcmp must not be called with a NULL argument */
+		return NULL;
+	}
+
+	for (i = 0; i < G_N_ELEMENTS (tokenizers); i++) {
+		if (strcmp (tokenizers[i].name, name) == 0) {
+			return &tokenizers[i];
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Ordering callback for trees of token_node_t: nodes are ordered by h1 and,
+ * if h1 is equal, by h2.
+ *
+ * Returns a negative value if a sorts before b, zero if both hash pairs are
+ * equal and a positive value otherwise.
+ *
+ * The fields are compared with relational operators and are never subtracted:
+ * a difference of two hash values converted to int can wrap around and report
+ * the wrong sign, which gives an inconsistent ordering and makes tree lookups
+ * miss nodes that are actually stored.
+ */
+int
+token_node_compare_func (gconstpointer a, gconstpointer b)
+{
+	const token_node_t *aa = a, *bb = b;
+
+	if (aa->h1 != bb->h1) {
+		return aa->h1 < bb->h1 ? -1 : 1;
+	}
+
+	if (aa->h2 != bb->h2) {
+		return aa->h2 < bb->h2 ? -1 : 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Bounds-checked lookup in t_delimiters: a byte that has no entry in the table
+ * is treated as an ordinary (non-delimiter) character instead of reading past
+ * the end of the table.
+ */
+static gboolean
+rspamd_tokenizer_is_delimiter (guchar c)
+{
+	return c < G_N_ELEMENTS (t_delimiters) && t_delimiters[c] != 0;
+}
+
+/*
+ * Get next word from specified buf.
+ *
+ * buf        - the text being scanned
+ * token      - in/out: token->begin is the position to resume from (NULL on
+ *              the first call); on return begin/len describe the word found
+ * exceptions - optional in/out list of struct process_exception; the head of
+ *              the list describes a region (pos, len) that must be skipped
+ *
+ * Returns the position from which the scan should be resumed (the caller is
+ * expected to store it in token->begin before the next call), or NULL if buf
+ * is NULL, if the start position is not inside buf, or if the end of buf has
+ * been reached.
+ *
+ * When the scan reaches the start of the current exception, the list is
+ * advanced to the next exception and the position just after the skipped
+ * region is returned immediately; token then holds what has been collected
+ * before the exception.
+ *
+ * NOTE(review): the byte at the start position is always skipped without being
+ * tested, so the first byte of the text never belongs to a word; and a word
+ * that ends exactly at the end of buf is filled into token but NULL is
+ * returned for it. Both behaviours are preserved here, since changing them
+ * would change the produced tokens.
+ */
+gchar *
+rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GList **exceptions)
+{
+	gsize remain, pos;
+	guchar *p;
+	struct process_exception *ex = NULL;
+
+	if (buf == NULL) {
+		return NULL;
+	}
+
+	if (exceptions != NULL && *exceptions != NULL) {
+		ex = (*exceptions)->data;
+	}
+
+	if (token->begin == NULL) {
+		/* First call: start from the beginning of the buffer */
+		token->begin = buf->begin;
+		if (ex != NULL && ex->pos == 0) {
+			/* The text starts with an exception, start right after it */
+			token->begin += ex->len;
+		}
+	}
+
+	token->len = 0;
+
+	pos = token->begin - buf->begin;
+	if (pos >= buf->len) {
+		return NULL;
+	}
+
+	remain = buf->len - pos;
+	p = (guchar *)token->begin;
+	/* Step over the start position, then skip delimiters */
+	do {
+		if (ex != NULL && ex->pos == pos) {
+			/* Go to the next exception */
+			*exceptions = g_list_next (*exceptions);
+			return (gchar *)p + ex->len;
+		}
+		pos++;
+		p++;
+		remain--;
+	} while (remain > 0 && rspamd_tokenizer_is_delimiter (*p));
+
+	token->begin = (gchar *)p;
+
+	/* Collect the word: everything up to the next delimiter */
+	while (remain > 0 && !rspamd_tokenizer_is_delimiter (*p)) {
+		if (ex != NULL && ex->pos == pos) {
+			/* The word is cut by an exception */
+			*exceptions = g_list_next (*exceptions);
+			return (gchar *)p + ex->len;
+		}
+		token->len++;
+		pos++;
+		remain--;
+		p++;
+	}
+
+	if (remain == 0) {
+		/* End of the buffer */
+		return NULL;
+	}
+
+	/* p points to the delimiter that terminated the word */
+	return (gchar *)p;
+}
+
+/*
+ * Split text into words using rspamd_tokenizer_get_word.
+ *
+ * text       - buffer to split
+ * len        - length of text in bytes
+ * is_utf     - if TRUE, the length of a word is counted with g_utf8_strlen,
+ *              otherwise the byte length is used
+ * min_len    - words shorter than this are dropped; 0 disables the filter
+ * exceptions - passed through to rspamd_tokenizer_get_word
+ *
+ * Returns NULL if text is NULL or len is 0, otherwise a new GArray of
+ * rspamd_fstring_t (possibly empty). The elements point into text, nothing is
+ * copied, and the array is not freed here.
+ */
+GArray *
+rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
+		gsize min_len, GList **exceptions)
+{
+	rspamd_fstring_t src, word;
+	GArray *words;
+	gchar *next;
+	gsize word_len;
+
+	if (text == NULL || len == 0) {
+		return NULL;
+	}
+
+	src.begin = text;
+	src.len = len;
+	src.size = src.len;
+	/* A NULL begin tells the word extractor that this is the first call */
+	word.begin = NULL;
+	word.len = 0;
+
+	words = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t));
+
+	for (next = rspamd_tokenizer_get_word (&src, &word, exceptions);
+			next != NULL;
+			next = rspamd_tokenizer_get_word (&src, &word, exceptions)) {
+		word_len = is_utf ?
+			(gsize)g_utf8_strlen (word.begin, word.len) : word.len;
+
+		if (min_len == 0 || word_len >= min_len) {
+			g_array_append_val (words, word);
+		}
+
+		/* Resume from the position reported by the extractor */
+		word.begin = next;
+	}
+
+	return words;
+}
+
+
+/*
+ * Tokenize the subject of a message and add the resulting hash pairs to tree.
+ *
+ * task - task being processed; task->subject is used if it is set, otherwise
+ *        the subject is read from task->message
+ * tree - in/out: *tree is created (and registered for destruction in the task
+ *        pool) if it is NULL
+ *
+ * The subject is split with is_utf set and without a minimum word length or
+ * exceptions; the words are then passed to the "osb-text" tokenizer with
+ * save_token disabled. Nothing is added if there is no subject or if the
+ * tokenizer cannot be found.
+ */
+void
+tokenize_subject (struct rspamd_task *task, GTree ** tree)
+{
+	gchar *sub;
+	struct tokenizer *osb_tokenizer;
+	GArray *words;
+
+	if (tree == NULL) {
+		return;
+	}
+
+	if (*tree == NULL) {
+		*tree = g_tree_new (token_node_compare_func);
+		rspamd_mempool_add_destructor (task->task_pool,
+			(rspamd_mempool_destruct_t) g_tree_destroy, *tree);
+	}
+
+	osb_tokenizer = get_tokenizer ("osb-text");
+	if (osb_tokenizer == NULL) {
+		/* get_tokenizer returns NULL for unknown names */
+		return;
+	}
+
+	/* Try to use pre-defined subject */
+	if (task->subject != NULL) {
+		sub = task->subject;
+	}
+	else {
+		sub = (gchar *)g_mime_message_get_subject (task->message);
+	}
+
+	if (sub != NULL) {
+		words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL);
+		if (words != NULL) {
+			osb_tokenizer->tokenize_func (osb_tokenizer,
+					task->task_pool,
+					words,
+					tree,
+					FALSE,
+					TRUE,
+					NULL);
+			/* The words only point into sub, so just drop the array */
+			g_array_free (words, TRUE);
+		}
+	}
+}
+
+/*
+ * vi:ts=4
+ */
author	Vsevolod Stakhov <vsevolod@highsecure.ru>	2015-01-16 15:28:40 +0000
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>	2015-01-16 15:28:40 +0000
commit	b5597411a2a4f9b46c0076ccddb95f8eacc1cb7f (patch)
tree	a647a4306708df37a3ea1d97666fd2d325e24464 /src/libstat/tokenizers
parent	ffd95d7c71307bb9540f07bbaac3b04859226837 (diff)
download	rspamd-b5597411a2a4f9b46c0076ccddb95f8eacc1cb7f.tar.gz rspamd-b5597411a2a4f9b46c0076ccddb95f8eacc1cb7f.zip