aboutsummaryrefslogtreecommitdiffstats
path: root/src/tokenizers/osb.c
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2008-12-04 18:41:00 +0300
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2008-12-04 18:41:00 +0300
commit249c0583d2a12ddde67e05251e47f256a58cfd05 (patch)
tree97c1db6e72d4bec5a2640425127c2d094fadea86 /src/tokenizers/osb.c
parent42b81716ece887b0011b1e40b0101ad37598997e (diff)
downloadrspamd-249c0583d2a12ddde67e05251e47f256a58cfd05.tar.gz
rspamd-249c0583d2a12ddde67e05251e47f256a58cfd05.zip
* Use binary tree in tokenizers, that would provide us fast checking for unique tokens and have O(log n) difficulty
Diffstat (limited to 'src/tokenizers/osb.c')
-rw-r--r--src/tokenizers/osb.c27
1 files changed, 12 insertions, 15 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index 6799a121b..be930af28 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -20,10 +20,11 @@ static const int primes[] = {
797, 3277,
};
-token_list_t *
+GTree *
osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input)
{
- token_list_t *new = NULL, *head = NULL, *last = NULL;
+ token_node_t *new = NULL;
+ GTree *tree;
f_str_t token = { NULL, 0, 0 };
uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
int i;
@@ -33,6 +34,9 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
hashpipe[i] = 0xABCDEF;
}
+ tree = g_tree_new (token_node_compare_func);
+ memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, tree);
+
while (tokenizer->get_next_word (input, &token)) {
/* Shift hashpipe */
for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) {
@@ -43,25 +47,18 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
for (i = 0; i < FEATURE_WINDOW_SIZE - 2; i ++) {
h1 = hashpipe[0]* primes[0] + hashpipe[i] * primes[i<<1];
h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i<<1)-1];
- new = memory_pool_alloc (pool, sizeof (token_list_t));
+ new = memory_pool_alloc (pool, sizeof (token_node_t));
new->h1 = h1;
new->h2 = h2;
- if (last) {
- last->next = new;
- }
- else {
- head = new;
- }
- last = new;
- msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2);
+ if (g_tree_lookup (tree, new) == NULL) {
+ msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2);
+ g_tree_insert (tree, new, new);
+ }
}
}
- if (last) {
- last->next = NULL;
- }
- return head;
+ return tree;
}
/*