path: root/src/tokenizers
author     Vsevolod Stakhov <vsevolod@highsecure.ru>    2015-01-16 15:28:40 +0000
committer  Vsevolod Stakhov <vsevolod@highsecure.ru>    2015-01-16 15:28:40 +0000
commit     b5597411a2a4f9b46c0076ccddb95f8eacc1cb7f (patch)
tree       a647a4306708df37a3ea1d97666fd2d325e24464 /src/tokenizers
parent     ffd95d7c71307bb9540f07bbaac3b04859226837 (diff)
download   rspamd-b5597411a2a4f9b46c0076ccddb95f8eacc1cb7f.tar.gz
           rspamd-b5597411a2a4f9b46c0076ccddb95f8eacc1cb7f.zip
Reorganize statfiles and classifiers into libstat.
Diffstat (limited to 'src/tokenizers')
-rw-r--r--    src/tokenizers/osb.c           122
-rw-r--r--    src/tokenizers/tokenizers.c    260
-rw-r--r--    src/tokenizers/tokenizers.h     64
3 files changed, 0 insertions, 446 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
deleted file mode 100644
index 9dd12a8dd..000000000
--- a/src/tokenizers/osb.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2009-2012, Vsevolod Stakhov
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * OSB tokenizer
- */
-
-#include <sys/types.h>
-#include "tokenizers.h"
-
-/* Minimum length of token */
-#define MIN_LEN 4
-
-extern const int primes[];
-
-int
-osb_tokenize_text (struct tokenizer *tokenizer,
- rspamd_mempool_t * pool,
- GArray * input,
- GTree ** tree,
- gboolean save_token,
- gboolean is_utf,
- GList *exceptions)
-{
- token_node_t *new = NULL;
- rspamd_fstring_t *token;
- guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
- gint i, processed = 0;
- guint w;
-
- if (input == NULL) {
- return FALSE;
- }
-
- if (*tree == NULL) {
- *tree = g_tree_new (token_node_compare_func);
- rspamd_mempool_add_destructor (pool,
- (rspamd_mempool_destruct_t) g_tree_destroy,
- *tree);
- }
-
- memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0]));
-
- for (w = 0; w < input->len; w ++) {
- token = &g_array_index (input, rspamd_fstring_t, w);
-
- if (processed < FEATURE_WINDOW_SIZE) {
- /* Just fill a hashpipe */
- hashpipe[FEATURE_WINDOW_SIZE - ++processed] =
- rspamd_fstrhash_lc (token, is_utf);
- }
- else {
- /* Shift hashpipe */
- for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
- hashpipe[i] = hashpipe[i - 1];
- }
- hashpipe[0] = rspamd_fstrhash_lc (token, is_utf);
- processed++;
-
- for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
- h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
- h2 = hashpipe[0] * primes[1] + hashpipe[i] *
- primes[(i << 1) - 1];
- new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t));
- new->h1 = h1;
- new->h2 = h2;
- if (save_token) {
- new->extra =
- (uintptr_t)rspamd_mempool_fstrdup (pool, token);
- }
-
- if (g_tree_lookup (*tree, new) == NULL) {
- g_tree_insert (*tree, new, new);
- }
- }
- }
- }
-
- if (processed <= FEATURE_WINDOW_SIZE) {
- for (i = 1; i < processed; i++) {
- h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
- h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
- new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t));
- new->h1 = h1;
- new->h2 = h2;
- if (save_token) {
- new->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, token);
- }
-
- if (g_tree_lookup (*tree, new) == NULL) {
- g_tree_insert (*tree, new, new);
- }
- }
- }
-
- return TRUE;
-}
-
-/*
- * vi:ts=4
- */
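
For reference, the pairing scheme the deleted osb.c implemented — hash each token, keep the last FEATURE_WINDOW_SIZE hashes in a pipe, and combine the newest hash with each older one through the primes table — can be sketched in standalone C as below. This is a simplified illustration, not rspamd code: toy_hash() and the shortened primes table stand in for rspamd_fstrhash_lc() and the full table in tokenizers.c, and the window-fill and tail handling of the original are omitted.

    /*
     * Simplified sketch of the OSB (Orthogonal Sparse Bigram) pairing
     * from osb_tokenize_text() above.  Unlike the original, pairing
     * starts immediately instead of first filling the window.
     */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define FEATURE_WINDOW_SIZE 5

    /* First half of the primes table defined in tokenizers.c above */
    static const int primes[] = {
        1, 7, 3, 13, 5, 29, 11, 51, 23, 101,
    };

    /* Placeholder hash; the real code uses rspamd_fstrhash_lc() */
    static uint32_t toy_hash(const char *s)
    {
        uint32_t h = 2166136261u;
        while (*s) {
            h ^= (unsigned char)*s++;
            h *= 16777619u;
        }
        return h;
    }

    int main(void)
    {
        const char *words[] = { "this", "is", "a", "sample", "subject" };
        uint32_t hashpipe[FEATURE_WINDOW_SIZE];
        size_t w, i;

        memset(hashpipe, 0xfe, sizeof(hashpipe));

        for (w = 0; w < sizeof(words) / sizeof(words[0]); w++) {
            /* Shift the pipe and push the newest token hash to slot 0 */
            for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
                hashpipe[i] = hashpipe[i - 1];
            }
            hashpipe[0] = toy_hash(words[w]);

            /* Pair the newest token with each older token in the window */
            for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
                uint32_t h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
                uint32_t h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
                printf("token %zu, distance %zu -> h1=%u h2=%u\n", w, i, h1, h2);
            }
        }
        return 0;
    }
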
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
deleted file mode 100644
index 3e6c745ec..000000000
--- a/src/tokenizers/tokenizers.c
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * Copyright (c) 2009-2012, Vsevolod Stakhov
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * Common tokenization functions
- */
-
-#include <sys/types.h>
-#include "main.h"
-#include "tokenizers.h"
-
-struct tokenizer tokenizers[] = {
- {"osb-text", osb_tokenize_text, rspamd_tokenizer_get_word},
-};
-
-const int primes[] = {
- 1, 7,
- 3, 13,
- 5, 29,
- 11, 51,
- 23, 101,
- 47, 203,
- 97, 407,
- 197, 817,
- 397, 1637,
- 797, 3277,
-};
-
-const gchar t_delimiters[255] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
- 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
- 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
- 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0
-};
-
-struct tokenizer *
-get_tokenizer (const char *name)
-{
- guint i;
-
- for (i = 0; i < sizeof (tokenizers) / sizeof (tokenizers[0]); i++) {
- if (strcmp (tokenizers[i].name, name) == 0) {
- return &tokenizers[i];
- }
- }
-
- return NULL;
-}
-
-int
-token_node_compare_func (gconstpointer a, gconstpointer b)
-{
- const token_node_t *aa = a, *bb = b;
-
- if (aa->h1 == bb->h1) {
- return aa->h2 - bb->h2;
- }
-
- return aa->h1 - bb->h1;
-}
-
-/* Get next word from specified f_str_t buf */
-gchar *
-rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GList **exceptions)
-{
- gsize remain, pos;
- guchar *p;
- struct process_exception *ex = NULL;
-
- if (buf == NULL) {
- return NULL;
- }
-
- if (exceptions != NULL && *exceptions != NULL) {
- ex = (*exceptions)->data;
- }
-
- if (token->begin == NULL) {
- if (ex != NULL) {
- if (ex->pos == 0) {
- token->begin = buf->begin + ex->len;
- token->len = ex->len;
- }
- else {
- token->begin = buf->begin;
- token->len = 0;
- }
- }
- else {
- token->begin = buf->begin;
- token->len = 0;
- }
- }
-
- token->len = 0;
-
- pos = token->begin - buf->begin;
- if (pos >= buf->len) {
- return NULL;
- }
-
- remain = buf->len - pos;
- p = token->begin;
- /* Skip non delimiters symbols */
- do {
- if (ex != NULL && ex->pos == pos) {
- /* Go to the next exception */
- *exceptions = g_list_next (*exceptions);
- return p + ex->len;
- }
- pos++;
- p++;
- remain--;
- } while (remain > 0 && t_delimiters[*p]);
-
- token->begin = p;
-
- while (remain > 0 && !t_delimiters[*p]) {
- if (ex != NULL && ex->pos == pos) {
- *exceptions = g_list_next (*exceptions);
- return p + ex->len;
- }
- token->len++;
- pos++;
- remain--;
- p++;
- }
-
- if (remain == 0) {
- return NULL;
- }
-
- return p;
-}
-
-GArray *
-rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
- gsize min_len, GList **exceptions)
-{
- rspamd_fstring_t token, buf;
- gchar *pos;
- gsize l;
- GArray *res;
-
- if (len == 0 || text == NULL) {
- return NULL;
- }
-
- buf.begin = text;
- buf.len = len;
- buf.size = buf.len;
- token.begin = NULL;
- token.len = 0;
-
- res = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t));
- while ((pos = rspamd_tokenizer_get_word (&buf,
- &token, exceptions)) != NULL) {
- if (is_utf) {
- l = g_utf8_strlen (token.begin, token.len);
- }
- else {
- l = token.len;
- }
- if (min_len > 0 && l < min_len) {
- token.begin = pos;
- continue;
- }
- g_array_append_val (res, token);
-
- token.begin = pos;
- }
-
- return res;
-}
-
-
-void
-tokenize_subject (struct rspamd_task *task, GTree ** tree)
-{
- gchar *sub;
- struct tokenizer *osb_tokenizer;
- GArray *words;
-
- if (*tree == NULL) {
- *tree = g_tree_new (token_node_compare_func);
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) g_tree_destroy, *tree);
- }
-
- osb_tokenizer = get_tokenizer ("osb-text");
-
- /* Try to use pre-defined subject */
- if (task->subject != NULL) {
- sub = task->subject;
- }
- else {
- sub = (gchar *)g_mime_message_get_subject (task->message);
- }
-
- if (sub != NULL) {
- words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL);
- if (words != NULL) {
- osb_tokenizer->tokenize_func (osb_tokenizer,
- task->task_pool,
- words,
- tree,
- FALSE,
- TRUE,
- NULL);
- g_array_free (words, TRUE);
- }
- }
-}
-
-/*
- * vi:ts=4
- */
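
The word-splitting loop the deleted rspamd_tokenizer_get_word() implemented — skip delimiter bytes, then collect non-delimiter bytes into a token — can be sketched in standalone C as below. This is an approximation for illustration only: is_delim() is a broader stand-in for the t_delimiters[] lookup table, and the exception-list and UTF-8 handling are left out.

    /*
     * Standalone sketch of the delimiter-table word splitter used by
     * rspamd_tokenizer_get_word()/rspamd_tokenize_text() above.
     */
    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    /* Rough stand-in for the t_delimiters[] table */
    static int is_delim(unsigned char c)
    {
        return isspace(c) || ispunct(c);
    }

    int main(void)
    {
        const char *text = "Hello, tokenizer world: split me!";
        const char *p = text, *end = text + strlen(text);

        while (p < end) {
            /* Skip delimiter characters */
            while (p < end && is_delim((unsigned char)*p)) {
                p++;
            }
            const char *start = p;
            /* Collect the non-delimiter run as one token */
            while (p < end && !is_delim((unsigned char)*p)) {
                p++;
            }
            if (p > start) {
                printf("token: %.*s\n", (int)(p - start), start);
            }
        }
        return 0;
    }
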
diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h
deleted file mode 100644
index ed47e0add..000000000
--- a/src/tokenizers/tokenizers.h
+++ /dev/null
@@ -1,64 +0,0 @@
-#ifndef TOKENIZERS_H
-#define TOKENIZERS_H
-
-#include "config.h"
-#include "mem_pool.h"
-#include "fstring.h"
-#include "main.h"
-
-/* Size for features pipe */
-#define FEATURE_WINDOW_SIZE 5
-
-typedef struct token_node_s {
- guint32 h1;
- guint32 h2;
- double value;
- uintptr_t extra;
-} token_node_t;
-
-/* Common tokenizer structure */
-struct tokenizer {
- gchar *name;
- gint (*tokenize_func)(struct tokenizer *tokenizer,
- rspamd_mempool_t *pool,
- GArray *words,
- GTree **cur,
- gboolean save_token,
- gboolean is_utf,
- GList *exceptions);
- gchar * (*get_next_word)(rspamd_fstring_t *buf, rspamd_fstring_t *token, GList **exceptions);
-};
-
-/* Compare two token nodes */
-int token_node_compare_func (gconstpointer a, gconstpointer b);
-
-/* Get tokenizer structure by name or return NULL if this name is not found */
-struct tokenizer * get_tokenizer (const char *name);
-
-/* Get next word from specified f_str_t buf */
-gchar * rspamd_tokenizer_get_word (rspamd_fstring_t *buf,
- rspamd_fstring_t *token, GList **exceptions);
-
-/* Tokenize text into array of words (rspamd_fstring_t type) */
-GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
- gsize min_len, GList **exceptions);
-
-/* OSB tokenize function */
-int osb_tokenize_text (struct tokenizer *tokenizer,
- rspamd_mempool_t *pool,
- GArray *input,
- GTree **cur,
- gboolean save_token,
- gboolean is_utf,
- GList *exceptions);
-
-/* Make tokens for a subject */
-void tokenize_subject (struct rspamd_task *task, GTree ** tree);
-
-/* Array of all defined tokenizers */
-extern struct tokenizer tokenizers[];
-
-#endif
-/*
- * vi:ts=4
- */
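
The header above also fixes the dispatch pattern the rest of the old code relied on: a static table of named tokenizers searched with strcmp(), with the chosen entry's function pointer then invoked, as tokenize_subject() does via get_tokenizer("osb-text"). A minimal standalone sketch of that pattern, using hypothetical toy_* names rather than the rspamd types, might look like this:

    /*
     * Sketch of name-based tokenizer dispatch: a static { name, callback }
     * table searched by strcmp(), mirroring get_tokenizer() above.
     * toy_tokenize() is a stand-in for osb_tokenize_text().
     */
    #include <stdio.h>
    #include <string.h>

    struct toy_tokenizer {
        const char *name;
        int (*tokenize_func)(const char *text);
    };

    static int toy_tokenize(const char *text)
    {
        printf("tokenizing: %s\n", text);
        return 0;
    }

    static struct toy_tokenizer toy_tokenizers[] = {
        { "osb-text", toy_tokenize },
    };

    static struct toy_tokenizer *toy_get_tokenizer(const char *name)
    {
        size_t i;

        for (i = 0; i < sizeof(toy_tokenizers) / sizeof(toy_tokenizers[0]); i++) {
            if (strcmp(toy_tokenizers[i].name, name) == 0) {
                return &toy_tokenizers[i];
            }
        }
        return NULL;
    }

    int main(void)
    {
        struct toy_tokenizer *tok = toy_get_tokenizer("osb-text");

        if (tok != NULL) {
            tok->tokenize_func("example subject line");
        }
        return 0;
    }
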