Rework tokenization:

- Use normalized words if needed
- Allow use of seeded XXHash instead of the hand-made legacy hash
- Allow secure hashing using siphash
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-02-23 14:29:31 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-02-23 14:29:31 +0000
commit: 21a12878cc50c97444c41886b23e418087922783 (patch)
tree: c1f74997ac28d4355ebf2eb0997b0e6e2f22770c /src/libstat
parent: fec137a7cccd626ce248f619011b2570f75438f8 (diff)
download: rspamd-21a12878cc50c97444c41886b23e418087922783.tar.gz rspamd-21a12878cc50c97444c41886b23e418087922783.zip
5 files changed, 147 insertions, 45 deletions
diff --git a/src/libstat/stat_config.c b/src/libstat/stat_config.c
index 8a0514721..8b537f732 100644
--- a/src/libstat/stat_config.c
+++ b/src/libstat/stat_config.c
@@ -41,7 +41,7 @@ static struct rspamd_stat_classifier stat_classifiers[] = {
 };
 
 static struct rspamd_stat_tokenizer stat_tokenizers[] = {
-	{"osb-text", osb_tokenize_text},
+	{"osb-text", rspamd_tokenizer_osb},
 };
 
 static struct rspamd_stat_backend stat_backends[] = {
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index eafbe2092..f5a4b9398 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -287,6 +287,20 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
 	GArray *words;
 	gchar *sub;
 	GList *cur;
+	const ucl_object_t *elt;
+	gboolean compat = TRUE;
+
+	/*
+	 * XXX: Ugly repetition to be backward compatible
+	 */
+	if (cf != NULL && cf->opts != NULL) {
+		elt = ucl_object_find_key (cf->opts, "hash");
+		if (elt != NULL && ucl_object_type (elt) == UCL_STRING) {
+			if (g_ascii_strcasecmp (ucl_object_tostring (elt), "xxh") == 0) {
+				compat = FALSE;
+			}
+		}
+	}
 
 	cur = task->text_parts;
 
@@ -297,8 +311,15 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
 			/*
 			 * XXX: Use normalized words if needed here
 			 */
-			tok->tokenizer->tokenize_func (cf, task->task_pool,
+
+			if (compat) {
+				tok->tokenizer->tokenize_func (cf, task->task_pool,
 					part->words, tok->tokens, part->is_utf);
+			}
+			else {
+				tok->tokenizer->tokenize_func (cf, task->task_pool,
+					part->normalized_words, tok->tokens, part->is_utf);
+			}
 		}
 
 		cur = g_list_next (cur);
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index b51e909a9..18157acd1 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -28,17 +28,28 @@
 
 #include "tokenizers.h"
 #include "stat_internal.h"
+#include "libstemmer.h"
+#include "xxhash.h"
+#include "siphash.h"
 
 /* Size for features pipe */
-#define FEATURE_WINDOW_SIZE 5
-
-/* Minimum length of token */
-#define MIN_LEN 4
-
-extern const int primes[];
+#define DEFAULT_FEATURE_WINDOW_SIZE 5
+
+static const int primes[] = {
+	1, 7,
+	3, 13,
+	5, 29,
+	11, 51,
+	23, 101,
+	47, 203,
+	97, 407,
+	197, 817,
+	397, 1637,
+	797, 3277,
+};
 
 int
-osb_tokenize_text (struct rspamd_tokenizer_config *cf,
+rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
 	rspamd_mempool_t * pool,
 	GArray * input,
 	GTree * tree,
@@ -46,9 +57,15 @@ osb_tokenize_text (struct rspamd_tokenizer_config *cf,
 {
 	rspamd_token_t *new = NULL;
 	rspamd_fstring_t *token;
-	guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
-	gint i, processed = 0;
-	guint w;
+	const ucl_object_t *elt;
+	guint64 *hashpipe, cur;
+	guint32 h1, h2;
+	guint processed = 0, i, w, window_size = DEFAULT_FEATURE_WINDOW_SIZE;
+	gboolean compat = TRUE, secure = FALSE;
+	gint64 seed = 0xdeadbabe;
+	guchar *key = NULL;
+	gsize keylen;
+	struct sipkey sk;
 
 	g_assert (tree != NULL);
 
@@ -56,32 +73,100 @@ osb_tokenize_text (struct rspamd_tokenizer_config *cf,
 		return FALSE;
 	}
 
-	memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0]));
+	if (cf != NULL && cf->opts != NULL) {
+		elt = ucl_object_find_key (cf->opts, "hash");
+		if (elt != NULL && ucl_object_type (elt) == UCL_STRING) {
+			if (g_ascii_strncasecmp (ucl_object_tostring (elt), "xxh", 3)
+					== 0) {
+				compat = FALSE;
+				secure = FALSE;
+				elt = ucl_object_find_key (cf->opts, "seed");
+				if (elt != NULL && ucl_object_type (elt) == UCL_INT) {
+					seed = ucl_object_toint (elt);
+				}
+			}
+			else if (g_ascii_strncasecmp (ucl_object_tostring (elt), "sip", 3)
+					== 0) {
+				compat = FALSE;
+				elt = ucl_object_find_key (cf->opts, "seed");
+
+				if (elt != NULL && ucl_object_type (elt) == UCL_STRING) {
+					key = rspamd_decode_base32 (ucl_object_tostring (elt),
+							0, &keylen);
+					if (keylen < 16) {
+						msg_warn ("siphash seed is too short: %s", keylen);
+						g_free (key);
+					}
+					else {
+						secure = TRUE;
+						sip_tokey (&sk, key);
+						g_free (key);
+					}
+				}
+				else {
+					msg_warn ("siphash cannot be used without seed");
+				}
+
+			}
+		}
+		elt = ucl_object_find_key (cf->opts, "window");
+		if (elt != NULL && ucl_object_type (elt) == UCL_INT) {
+			window_size = ucl_object_toint (elt);
+			if (window_size > DEFAULT_FEATURE_WINDOW_SIZE * 4) {
+				msg_err ("too large window size: %d", window_size);
+				window_size = DEFAULT_FEATURE_WINDOW_SIZE;
+			}
+		}
+	}
+
+	hashpipe = g_alloca (window_size * sizeof (hashpipe[0]));
+	memset (hashpipe, 0xfe, window_size * sizeof (hashpipe[0]));
 
 	for (w = 0; w < input->len; w ++) {
 		token = &g_array_index (input, rspamd_fstring_t, w);
 
-		if (processed < FEATURE_WINDOW_SIZE) {
+		if (compat) {
+			cur = rspamd_fstrhash_lc (token, is_utf);
+		}
+		else {
+			/* We know that the words are normalized */
+			if (!secure) {
+				cur = XXH64 (token->begin, token->len, seed);
+			}
+			else {
+				cur = siphash24 (token->begin, token->len, &sk);
+			}
+		}
+
+		if (processed < window_size) {
 			/* Just fill a hashpipe */
-			hashpipe[FEATURE_WINDOW_SIZE - ++processed] =
-				rspamd_fstrhash_lc (token, is_utf);
+			hashpipe[window_size - ++processed] = cur;
 		}
 		else {
 			/* Shift hashpipe */
-			for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
+			for (i = window_size - 1; i > 0; i--) {
 				hashpipe[i] = hashpipe[i - 1];
 			}
-			hashpipe[0] = rspamd_fstrhash_lc (token, is_utf);
+			hashpipe[0] = cur;
 			processed++;
 
-			for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
-				h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
-				h2 = hashpipe[0] * primes[1] + hashpipe[i] *
-					primes[(i << 1) - 1];
+			for (i = 1; i < window_size; i++) {
 				new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
-				new->datalen = sizeof(gint32) * 2;
-				memcpy(new->data, &h1, sizeof(h1));
-				memcpy(new->data + sizeof(h1), &h2, sizeof(h2));
+				new->datalen = sizeof (gint64);
+
+				if (compat) {
+					h1 = ((guint32)hashpipe[0]) * primes[0] +
+							((guint32)hashpipe[i]) * primes[i << 1];
+					h2 = ((guint32)hashpipe[0]) * primes[1] +
+							((guint32)hashpipe[i]) * primes[(i << 1) - 1];
+
+					memcpy(new->data, &h1, sizeof (h1));
+					memcpy(new->data + sizeof (h1), &h2, sizeof (h2));
+				}
+				else {
+					cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
+					memcpy (new->data, &cur, sizeof (cur));
+				}
 
 				if (g_tree_lookup (tree, new) == NULL) {
 					g_tree_insert (tree, new, new);
@@ -90,14 +175,23 @@ osb_tokenize_text (struct rspamd_tokenizer_config *cf,
 		}
 	}
 
-	if (processed <= FEATURE_WINDOW_SIZE) {
+	if (processed <= window_size) {
 		for (i = 1; i < processed; i++) {
-			h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
-			h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
 			new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
-			new->datalen = sizeof(gint32) * 2;
-			memcpy(new->data, &h1, sizeof(h1));
-			memcpy(new->data + sizeof(h1), &h2, sizeof(h2));
+			new->datalen = sizeof (gint64);
+
+			if (compat) {
+				h1 = ((guint32)hashpipe[0]) * primes[0] +
+						((guint32)hashpipe[i]) * primes[i << 1];
+				h2 = ((guint32)hashpipe[0]) * primes[1] +
+						((guint32)hashpipe[i]) * primes[(i << 1) - 1];
+				memcpy(new->data, &h1, sizeof (h1));
+				memcpy(new->data + sizeof (h1), &h2, sizeof (h2));
+			}
+			else {
+				cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
+				memcpy (new->data, &cur, sizeof (cur));
+			}
 
 			if (g_tree_lookup (tree, new) == NULL) {
 				g_tree_insert (tree, new, new);
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 6ec7b1e10..2abe0f318 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -30,19 +30,6 @@
 #include "tokenizers.h"
 #include "stat_internal.h"
 
-const int primes[] = {
-	1, 7,
-	3, 13,
-	5, 29,
-	11, 51,
-	23, 101,
-	47, 203,
-	97, 407,
-	197, 817,
-	397, 1637,
-	797, 3277,
-};
-
 const gchar t_delimiters[255] = {
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
 	1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 0bc594842..bab18b00a 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -31,7 +31,7 @@ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
 		gsize min_len, GList **exceptions);
 
 /* OSB tokenize function */
-int osb_tokenize_text (struct rspamd_tokenizer_config *cf,
+int rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
 	rspamd_mempool_t *pool,
 	GArray *input,
 	GTree *tokens,
author	Vsevolod Stakhov <vsevolod@highsecure.ru>	2015-02-23 14:29:31 +0000
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>	2015-02-23 14:29:31 +0000
commit	21a12878cc50c97444c41886b23e418087922783 (patch)
tree	c1f74997ac28d4355ebf2eb0997b0e6e2f22770c /src/libstat
parent	fec137a7cccd626ce248f619011b2570f75438f8 (diff)
download	rspamd-21a12878cc50c97444c41886b23e418087922783.tar.gz rspamd-21a12878cc50c97444c41886b23e418087922783.zip