author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2008-11-07 19:35:13 +0300
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2008-11-07 19:35:13 +0300
commit | 62cdcc73c4f817516cbcb20e9e5bfad556bea4b7 (patch)
tree | a6ab9b606bf4e44e405fe9ee4dae983938c9cee4 /src/tokenizers/osb.c
parent | 2175980532791f90807eb03ef99d6f7006ada4e6 (diff)
download | rspamd-62cdcc73c4f817516cbcb20e9e5bfad556bea4b7.tar.gz rspamd-62cdcc73c4f817516cbcb20e9e5bfad556bea4b7.zip
* Add simple implementation of OSB tokenizer
Diffstat (limited to 'src/tokenizers/osb.c')
-rw-r--r-- | src/tokenizers/osb.c | 69
1 file changed, 69 insertions, 0 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
new file mode 100644
index 000000000..f78e20992
--- /dev/null
+++ b/src/tokenizers/osb.c
@@ -0,0 +1,69 @@
+/*
+ * OSB tokenizer
+ */
+
+#include <sys/types.h>
+#include "tokenizers.h"
+
+
+/* Coefficients that are used for OSB tokenizer */
+static const int primes[] = {
+	1, 7,
+	3, 13,
+	5, 29,
+	11, 51,
+	23, 101,
+	47, 203,
+	97, 407,
+	197, 817,
+	397, 1637,
+	797, 3277,
+};
+
+token_list_t *
+osb_tokenize_text (memory_pool_t *pool, f_str_t *input)
+{
+	token_list_t *new = NULL, *head = NULL, *last = NULL;
+	f_str_t token = { NULL, 0, 0 };
+	uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
+	int i;
+
+	/* First set all bytes of hashpipe to some common value */
+	for (i = 0; i < FEATURE_WINDOW_SIZE; i ++) {
+		hashpipe[i] = 0xABCDEF;
+	}
+
+	while (get_next_word (input, &token)) {
+		/* Shift hashpipe */
+		for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) {
+			hashpipe[i] = hashpipe[i - 1];
+		}
+		hashpipe[0] = fstrhash (&token);
+
+		for (i = 0; i < FEATURE_WINDOW_SIZE - 2; i ++) {
+			h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i<<1];
+			h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i<<1)-1];
+			new = memory_pool_alloc (pool, sizeof (token_list_t));
+			new->h1 = h1;
+			new->h2 = h2;
+			if (last) {
+				last->next = new;
+			}
+			else {
+				head = new;
+			}
+			last = new;
+
+			msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2);
+		}
+	}
+	if (last) {
+		last->next = NULL;
+	}
+
+	return head;
+}
+
+/*
+ * vi:ts=4
+ */
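A note on the inner loop as committed: it starts at i = 0, so the first pair combines hashpipe[0] with itself and reads primes[(0 << 1) - 1], i.e. primes[-1], which lies outside the coefficient table; it also appears to stop at FEATURE_WINDOW_SIZE - 2 rather than covering every older slot in the window. Below is a standalone sketch of the usual OSB pairing (the newest word's hash mixed with each older slot), not rspamd code: the WINDOW value, toy_hash(), the sample word list and the i = 1 loop bounds are all assumptions made purely for illustration.

/*
 * Standalone sketch of the OSB pairing idea, for illustration only.
 * WINDOW, toy_hash() and the sample words are assumptions; rspamd
 * uses FEATURE_WINDOW_SIZE, fstrhash() and its own token stream.
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define WINDOW 5

/* same coefficient table as in the diff above */
static const uint32_t primes[] = {
	1, 7, 3, 13, 5, 29, 11, 51, 23, 101,
	47, 203, 97, 407, 197, 817, 397, 1637, 797, 3277,
};

/* toy string hash standing in for fstrhash() */
static uint32_t
toy_hash (const char *s)
{
	uint32_t h = 5381;

	while (*s) {
		h = h * 33 + (unsigned char)*s++;
	}
	return h;
}

int
main (void)
{
	const char *words[] = { "cheap", "meds", "online", "buy", "now" };
	uint32_t hashpipe[WINDOW], h1, h2;
	size_t w;
	int i;

	/* prime the pipe with a common value, as the tokenizer does */
	for (i = 0; i < WINDOW; i ++) {
		hashpipe[i] = 0xABCDEF;
	}

	for (w = 0; w < sizeof (words) / sizeof (words[0]); w ++) {
		/* shift older hashes down and put the new word in slot 0 */
		for (i = WINDOW - 1; i > 0; i --) {
			hashpipe[i] = hashpipe[i - 1];
		}
		hashpipe[0] = toy_hash (words[w]);

		/* pair the newest word with each older slot in the window */
		for (i = 1; i < WINDOW; i ++) {
			h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
			h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
			printf ("%s + slot %d -> h1=%" PRIu32 " h2=%" PRIu32 "\n",
				words[w], i, h1, h2);
		}
	}
	return 0;
}

Emitting two hashes per pair mirrors the diff above, where each token_list_t node carries an h1/h2 pair produced from the two coefficient columns of the primes table.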