diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2008-12-04 18:41:00 +0300 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2008-12-04 18:41:00 +0300 |
commit | 249c0583d2a12ddde67e05251e47f256a58cfd05 (patch) | |
tree | 97c1db6e72d4bec5a2640425127c2d094fadea86 | |
parent | 42b81716ece887b0011b1e40b0101ad37598997e (diff) | |
download | rspamd-249c0583d2a12ddde67e05251e47f256a58cfd05.tar.gz rspamd-249c0583d2a12ddde67e05251e47f256a58cfd05.zip |
* Use binary tree in tokenizers, that would provide us fast checking for unique tokens and have O(log n) difficulty
-rw-r--r-- | src/tokenizers/osb.c | 27 | ||||
-rw-r--r-- | src/tokenizers/tokenizers.c | 12 | ||||
-rw-r--r-- | src/tokenizers/tokenizers.h | 12 |
3 files changed, 30 insertions, 21 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c index 6799a121b..be930af28 100644 --- a/src/tokenizers/osb.c +++ b/src/tokenizers/osb.c @@ -20,10 +20,11 @@ static const int primes[] = { 797, 3277, }; -token_list_t * +GTree * osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input) { - token_list_t *new = NULL, *head = NULL, *last = NULL; + token_node_t *new = NULL; + GTree *tree; f_str_t token = { NULL, 0, 0 }; uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2; int i; @@ -33,6 +34,9 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in hashpipe[i] = 0xABCDEF; } + tree = g_tree_new (token_node_compare_func); + memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, tree); + while (tokenizer->get_next_word (input, &token)) { /* Shift hashpipe */ for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) { @@ -43,25 +47,18 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in for (i = 0; i < FEATURE_WINDOW_SIZE - 2; i ++) { h1 = hashpipe[0]* primes[0] + hashpipe[i] * primes[i<<1]; h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i<<1)-1]; - new = memory_pool_alloc (pool, sizeof (token_list_t)); + new = memory_pool_alloc (pool, sizeof (token_node_t)); new->h1 = h1; new->h2 = h2; - if (last) { - last->next = new; - } - else { - head = new; - } - last = new; - msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2); + if (g_tree_lookup (tree, new) == NULL) { + msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2); + g_tree_insert (tree, new, new); + } } } - if (last) { - last->next = NULL; - } - return head; + return tree; } /* diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c index 25b13a289..853207af4 100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -23,6 +23,18 @@ get_tokenizer (char *name) return NULL; } +int +token_node_compare_func (gconstpointer a, gconstpointer b) +{ + const token_node_t *aa = a, *bb = b; + + if (aa->h1 == bb->h1) { + return aa->h2 - bb->h2; + } + + return aa->h1 - bb->h1; +} + /* Get next word from specified f_str_t buf */ f_str_t * get_next_word (f_str_t *buf, f_str_t *token) diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h index 96a2027a5..c3453a945 100644 --- a/src/tokenizers/tokenizers.h +++ b/src/tokenizers/tokenizers.h @@ -14,26 +14,26 @@ /* Size for features pipe */ #define FEATURE_WINDOW_SIZE 5 -typedef struct token_list_s { +typedef struct token_node_s { uint32_t h1; uint32_t h2; - struct token_list_s *next; -} token_list_t; - +} token_node_t; /* Common tokenizer structure */ struct tokenizer { char *name; - token_list_t* (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input); + GTree* (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input); f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token); }; +/* Compare two token nodes */ +int token_node_compare_func (gconstpointer a, gconstpointer b); /* Get tokenizer structure by name or return NULL if this name is not found */ struct tokenizer* get_tokenizer (char *name); /* Get next word from specified f_str_t buf */ f_str_t *get_next_word (f_str_t *buf, f_str_t *token); /* OSB tokenize function */ -token_list_t* osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input); +GTree* osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input); /* Array of all defined tokenizers */ extern struct tokenizer tokenizers[]; |