* Add simple implementation of OSB tokenizer

author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2008-11-07 19:35:13 +0300
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2008-11-07 19:35:13 +0300
commit: 62cdcc73c4f817516cbcb20e9e5bfad556bea4b7 (patch)
tree: a6ab9b606bf4e44e405fe9ee4dae983938c9cee4 /src/tokenizers
parent: 2175980532791f90807eb03ef99d6f7006ada4e6 (diff)
download: rspamd-62cdcc73c4f817516cbcb20e9e5bfad556bea4b7.tar.gz
rspamd-62cdcc73c4f817516cbcb20e9e5bfad556bea4b7.zip
3 files changed, 143 insertions, 0 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
new file mode 100644
index 000000000..f78e20992
--- /dev/null
+++ b/src/tokenizers/osb.c
@@ -0,0 +1,69 @@
+/*
+ * OSB tokenizer
+ */
+
+#include <sys/types.h>
+#include "tokenizers.h"
+
+
+/*
+ * Coefficients that are used for OSB tokenizer.
+ *
+ * osb_tokenize_text multiplies word hashes by entries of this table when
+ * combining two words into the h1/h2 pair of a feature.  The table is laid
+ * out two values per row and is indexed by position, so the order of the
+ * entries matters.  Despite the name, not every entry is a prime number
+ * (for example 51 = 3 * 17 and 203 = 7 * 29).
+ */
+static const int primes[] = {
+	1, 7,
+	3, 13,
+	5, 29,
+	11, 51,
+	23, 101,
+	47, 203,
+	97, 407,
+	197, 817,
+	397, 1637,
+	797, 3277,
+};
+
+/*
+ * Split input into words and build a linked list of OSB feature hashes.
+ *
+ * Every word returned by get_next_word is hashed with fstrhash and pushed
+ * into a sliding window (hashpipe) of FEATURE_WINDOW_SIZE word hashes.  The
+ * newest word is then paired with each older word in the window; every pair
+ * yields one token_list_t node carrying two hashes (h1, h2) built with
+ * different coefficients from the primes table.
+ *
+ * pool  - memory pool the list nodes are allocated from
+ * input - text to tokenize
+ *
+ * Returns the head of the list, or NULL if input contains no words.
+ */
+token_list_t *
+osb_tokenize_text (memory_pool_t *pool, f_str_t *input)
+{
+	token_list_t *new = NULL, *head = NULL, *last = NULL;
+	f_str_t token = { NULL, 0, 0 };
+	uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
+	int i;
+
+	/* Seed every slot of hashpipe with a common filler value */
+	for (i = 0; i < FEATURE_WINDOW_SIZE; i ++) {
+		hashpipe[i] = 0xABCDEF;
+	}
+
+	while (get_next_word (input, &token)) {
+		/* Shift hashpipe: slot 0 holds the newest word, higher slots older ones */
+		for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) {
+			hashpipe[i] = hashpipe[i - 1];
+		}
+		hashpipe[0] = fstrhash (&token);
+
+		/*
+		 * Pair the newest word with every older word in the window.
+		 * The loop starts at 1: index 0 would pair the word with itself
+		 * and would make (i << 1) - 1 evaluate to -1, reading before the
+		 * start of the primes table.  It runs up to the last slot so that
+		 * the whole window takes part in feature generation.
+		 */
+		for (i = 1; i < FEATURE_WINDOW_SIZE; i ++) {
+			h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
+			h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
+			new = memory_pool_alloc (pool, sizeof (token_list_t));
+			new->h1 = h1;
+			new->h2 = h2;
+			/* Terminate here so the list is well formed at every step */
+			new->next = NULL;
+			if (last) {
+				last->next = new;
+			}
+			else {
+				head = new;
+			}
+			last = new;
+
+			msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2);
+		}
+	}
+
+	return head;
+}
+
+/*
+ * vi:ts=4
+ */
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
new file mode 100644
index 000000000..132a57ce0
--- /dev/null
+++ b/src/tokenizers/tokenizers.c
@@ -0,0 +1,45 @@
+/*
+ * Common tokenization functions
+ */
+
+#include <sys/types.h>
+#include "tokenizers.h"
+
+/*
+ * Get next word from specified f_str_t buf.
+ *
+ * A word is a maximal run of bytes for which g_ascii_isgraph is true.
+ * token carries the iteration state between calls: pass a token whose
+ * begin is NULL and whose len is 0 on the first call, then pass the same
+ * token back unchanged to obtain each following word.  On success token
+ * points at the word inside buf (no copy is made, the word is not
+ * NUL-terminated) and token->len is its length.
+ *
+ * Returns token, or NULL when buf or token is NULL or no further word
+ * exists in buf.
+ */
+f_str_t *
+get_next_word (f_str_t *buf, f_str_t *token)
+{
+	size_t remain, consumed;
+	char *pos;
+
+	if (buf == NULL || token == NULL || buf->begin == NULL) {
+		return NULL;
+	}
+
+	if (token->begin == NULL) {
+		/* First call: start scanning at the beginning of the buffer */
+		token->begin = buf->begin;
+		token->len = 0;
+	}
+
+	/*
+	 * Step over the previous word first, so that the remaining length is
+	 * measured from the real scan position and not from the old word start.
+	 */
+	pos = token->begin + token->len;
+	token->begin = pos;
+	token->len = 0;
+
+	consumed = (size_t)(pos - buf->begin);
+	if (consumed >= buf->len) {
+		return NULL;
+	}
+	remain = buf->len - consumed;
+
+	/*
+	 * Skip non graph symbols.  remain is tested before it is decremented,
+	 * so the unsigned counter can never wrap around past zero.
+	 */
+	while (remain > 0 && !g_ascii_isgraph (*pos)) {
+		pos ++;
+		remain --;
+	}
+	token->begin = pos;
+
+	/* Collect the word; every graph byte is counted exactly once */
+	while (remain > 0 && g_ascii_isgraph (*pos)) {
+		pos ++;
+		token->len ++;
+		remain --;
+	}
+
+	/* Only non graph bytes were left, so there is no further word */
+	if (token->len == 0) {
+		return NULL;
+	}
+
+	return token;
+}
+
+/*
+ * vi:ts=4
+ */
diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h
new file mode 100644
index 000000000..6b4bff5e0
--- /dev/null
+++ b/src/tokenizers/tokenizers.h
@@ -0,0 +1,29 @@
+#ifndef TOKENIZERS_H
+#define TOKENIZERS_H
+
+#include <sys/types.h>
+#include "../config.h"
+
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+#include "../mem_pool.h"
+#include "../fstring.h"
+#include "../main.h"
+
+/* Size for features pipe */
+#define FEATURE_WINDOW_SIZE 5
+
+/*
+ * Node of a singly linked list of features produced by a tokenizer.
+ * osb_tokenize_text fills h1 and h2 with two hashes of the same word pair,
+ * computed with different coefficients.
+ */
+typedef struct token_list_s {
+	uint32_t h1;					/* first hash of the feature */
+	uint32_t h2;					/* second hash of the feature */
+	struct token_list_s *next;		/* next node, NULL at the end of the list */
+} token_list_t;
+
+/*
+ * Get next word from specified f_str_t buf.
+ *
+ * Words are runs of bytes accepted by g_ascii_isgraph.  token holds the
+ * scan position between calls: start with token->begin == NULL and
+ * token->len == 0, then pass the same token back on every following call.
+ * Returns token, or NULL when buf is NULL or has been consumed.
+ */
+f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
+
+#endif
+/*
+ * vi:ts=4
+ */
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2008-11-07 19:35:13 +0300
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2008-11-07 19:35:13 +0300
commit	62cdcc73c4f817516cbcb20e9e5bfad556bea4b7 (patch)
tree	a6ab9b606bf4e44e405fe9ee4dae983938c9cee4 /src/tokenizers
parent	2175980532791f90807eb03ef99d6f7006ada4e6 (diff)
download	rspamd-62cdcc73c4f817516cbcb20e9e5bfad556bea4b7.tar.gz rspamd-62cdcc73c4f817516cbcb20e9e5bfad556bea4b7.zip