author    Vsevolod Stakhov <vsevolod@rambler-co.ru>    2008-12-04 18:41:00 +0300
committer Vsevolod Stakhov <vsevolod@rambler-co.ru>    2008-12-04 18:41:00 +0300
commit    249c0583d2a12ddde67e05251e47f256a58cfd05 (patch)
tree      97c1db6e72d4bec5a2640425127c2d094fadea86
parent    42b81716ece887b0011b1e40b0101ad37598997e (diff)
* Use a binary tree in tokenizers; this gives us fast checking for unique tokens with O(log n) complexity
-rw-r--r--  src/tokenizers/osb.c        | 27
-rw-r--r--  src/tokenizers/tokenizers.c | 12
-rw-r--r--  src/tokenizers/tokenizers.h | 12
3 files changed, 30 insertions(+), 21 deletions(-)
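
The commit replaces the tokenizer's linked token list with a GLib GTree so duplicate (h1, h2) pairs can be detected in O(log n). Below is a minimal, self-contained sketch of that idea, not rspamd code: node_t, node_cmp and the sample pairs are stand-ins, and it uses g_tree_new_full()/g_malloc() instead of the patch's g_tree_new() plus memory pool, purely so the example can clean up after itself.

#include <glib.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
	uint32_t h1;
	uint32_t h2;
} node_t;

/* Order (h1, h2) pairs; equal pairs are duplicates. Explicit comparisons
 * avoid the unsigned wrap-around that plain subtraction could hit. */
static gint
node_cmp (gconstpointer a, gconstpointer b, gpointer user_data)
{
	const node_t *aa = a, *bb = b;

	(void) user_data;
	if (aa->h1 != bb->h1) {
		return aa->h1 < bb->h1 ? -1 : 1;
	}
	if (aa->h2 != bb->h2) {
		return aa->h2 < bb->h2 ? -1 : 1;
	}
	return 0;
}

int
main (void)
{
	/* g_tree_new_full() frees keys on destroy; the patch itself uses
	 * g_tree_new() because node memory belongs to rspamd's memory pool */
	GTree *tree = g_tree_new_full (node_cmp, NULL, g_free, NULL);
	uint32_t pairs[][2] = { { 1, 2 }, { 3, 4 }, { 1, 2 }, { 3, 5 } };
	gsize i;

	for (i = 0; i < G_N_ELEMENTS (pairs); i++) {
		node_t *new = g_malloc (sizeof (node_t));

		new->h1 = pairs[i][0];
		new->h2 = pairs[i][1];
		/* O(log n) lookup followed by O(log n) insert, as in the patch */
		if (g_tree_lookup (tree, new) == NULL) {
			g_tree_insert (tree, new, new);
		}
		else {
			g_free (new);	/* duplicate pair: drop it */
		}
	}

	printf ("unique tokens: %d\n", g_tree_nnodes (tree));	/* prints 3 */
	g_tree_destroy (tree);

	return 0;
}
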
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index 6799a121b..be930af28 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -20,10 +20,11 @@ static const int primes[] = {
797, 3277,
};
-token_list_t *
+GTree *
osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input)
{
- token_list_t *new = NULL, *head = NULL, *last = NULL;
+ token_node_t *new = NULL;
+ GTree *tree;
f_str_t token = { NULL, 0, 0 };
uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
int i;
@@ -33,6 +34,9 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input)
hashpipe[i] = 0xABCDEF;
}
+ tree = g_tree_new (token_node_compare_func);
+ memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, tree);
+
while (tokenizer->get_next_word (input, &token)) {
/* Shift hashpipe */
for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) {
@@ -43,25 +47,18 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input)
for (i = 0; i < FEATURE_WINDOW_SIZE - 2; i ++) {
h1 = hashpipe[0]* primes[0] + hashpipe[i] * primes[i<<1];
h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i<<1)-1];
- new = memory_pool_alloc (pool, sizeof (token_list_t));
+ new = memory_pool_alloc (pool, sizeof (token_node_t));
new->h1 = h1;
new->h2 = h2;
- if (last) {
- last->next = new;
- }
- else {
- head = new;
- }
- last = new;
- msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2);
+ if (g_tree_lookup (tree, new) == NULL) {
+ msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2);
+ g_tree_insert (tree, new, new);
+ }
}
}
- if (last) {
- last->next = NULL;
- }
- return head;
+ return tree;
}
/*
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index 25b13a289..853207af4 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -23,6 +23,18 @@ get_tokenizer (char *name)
return NULL;
}
+int
+token_node_compare_func (gconstpointer a, gconstpointer b)
+{
+ const token_node_t *aa = a, *bb = b;
+
+ if (aa->h1 == bb->h1) {
+ return aa->h2 - bb->h2;
+ }
+
+ return aa->h1 - bb->h1;
+}
+
/* Get next word from specified f_str_t buf */
f_str_t *
get_next_word (f_str_t *buf, f_str_t *token)
diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h
index 96a2027a5..c3453a945 100644
--- a/src/tokenizers/tokenizers.h
+++ b/src/tokenizers/tokenizers.h
@@ -14,26 +14,26 @@
/* Size for features pipe */
#define FEATURE_WINDOW_SIZE 5
-typedef struct token_list_s {
+typedef struct token_node_s {
uint32_t h1;
uint32_t h2;
- struct token_list_s *next;
-} token_list_t;
-
+} token_node_t;
/* Common tokenizer structure */
struct tokenizer {
char *name;
- token_list_t* (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
+ GTree* (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token);
};
+/* Compare two token nodes */
+int token_node_compare_func (gconstpointer a, gconstpointer b);
/* Get tokenizer structure by name or return NULL if this name is not found */
struct tokenizer* get_tokenizer (char *name);
/* Get next word from specified f_str_t buf */
f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
/* OSB tokenize function */
-token_list_t* osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
+GTree* osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
/* Array of all defined tokenizers */
extern struct tokenizer tokenizers[];
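
Because tokenize_func now returns a GTree* rather than a token_list_t* chain, a consumer walks the unique tokens with g_tree_foreach() instead of following ->next pointers. The sketch below only illustrates that calling pattern, assuming GLib; print_token and the hand-built tree are stand-ins for whatever osb_tokenize_text() would hand back.

#include <glib.h>
#include <stdint.h>
#include <stdio.h>

typedef struct token_node_s {
	uint32_t h1;
	uint32_t h2;
} token_node_t;

/* Same ordering idea as token_node_compare_func: h1 first, then h2 */
static gint
token_cmp (gconstpointer a, gconstpointer b)
{
	const token_node_t *aa = a, *bb = b;

	if (aa->h1 != bb->h1) {
		return aa->h1 < bb->h1 ? -1 : 1;
	}
	if (aa->h2 != bb->h2) {
		return aa->h2 < bb->h2 ? -1 : 1;
	}
	return 0;
}

/* GTraverseFunc: returning FALSE tells g_tree_foreach to keep going */
static gboolean
print_token (gpointer key, gpointer value, gpointer data)
{
	const token_node_t *node = value;

	(void) key;
	(void) data;
	printf ("token h1=%u h2=%u\n", node->h1, node->h2);

	return FALSE;
}

int
main (void)
{
	/* Stand-in for the tree osb_tokenize_text() would return */
	GTree *tree = g_tree_new (token_cmp);
	token_node_t a = { 10, 20 }, b = { 5, 7 };

	g_tree_insert (tree, &a, &a);
	g_tree_insert (tree, &b, &b);

	/* Visits nodes in key order: (5, 7) before (10, 20) */
	g_tree_foreach (tree, print_token, NULL);

	g_tree_destroy (tree);
	return 0;
}

In rspamd itself callers only read from the tree; it is torn down by the pool destructor the patch registers via memory_pool_add_destructor (g_tree_destroy).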