1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
|
/*
* OSB tokenizer
*/
#include <sys/types.h>
#include "tokenizers.h"
/*
 * Multiplier table used to build the OSB feature hashes.
 *
 * osb_tokenize_text () multiplies the hash of the newest word by primes[0]
 * (for h1) and primes[1] (for h2); the hash of the word i positions back in
 * the window is multiplied by primes[2 * i] (for h1) and primes[2 * i - 1]
 * (for h2).
 *
 * Despite the name, several entries (e.g. 1, 51, 203) are not prime.
 */
static const int primes[] = {
	  1,    7,	/* [0],  [1]  */
	  3,   13,	/* [2],  [3]  */
	  5,   29,	/* [4],  [5]  */
	 11,   51,	/* [6],  [7]  */
	 23,  101,	/* [8],  [9]  */
	 47,  203,	/* [10], [11] */
	 97,  407,	/* [12], [13] */
	197,  817,	/* [14], [15] */
	397, 1637,	/* [16], [17] */
	797, 3277,	/* [18], [19] */
};
/*
 * Split input into words and store OSB features for them in a tree.
 *
 * Every word returned by tokenizer->get_next_word () is hashed and pushed
 * into a sliding window of the last FEATURE_WINDOW_SIZE word hashes. The
 * newest hash is then combined with each older hash in the window, giving
 * FEATURE_WINDOW_SIZE - 1 (h1, h2) pairs per word. Each pair that is not
 * already present in the tree is inserted as a new token_node_t.
 *
 * @param tokenizer provides get_next_word (), used to iterate over input
 * @param pool memory pool that inserted nodes are allocated from
 * @param input text to tokenize
 * @param tree in/out: if *tree is NULL a new tree is created and
 *             g_tree_destroy is registered for it as a destructor on pool
 * @return always TRUE
 */
int
osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **tree)
{
	token_node_t *node = NULL, probe;
	f_str_t token = { NULL, 0, 0 };
	uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
	int i;

	/*
	 * Seed every slot of the window with the same constant, so the first
	 * words are combined with a fixed value rather than with garbage.
	 */
	for (i = 0; i < FEATURE_WINDOW_SIZE; i ++) {
		hashpipe[i] = 0xABCDEF;
	}

	if (*tree == NULL) {
		*tree = g_tree_new (token_node_compare_func);
		memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, *tree);
	}

	msg_debug ("osb_tokenize_text: got input length: %zd", input->len);

	while (tokenizer->get_next_word (input, &token)) {
		/* Shift the window by one slot; the oldest hash falls off the end */
		for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) {
			hashpipe[i] = hashpipe[i - 1];
		}
		hashpipe[0] = fstrhash (&token);

		/*
		 * Pair the newest hash with each older one. The multipliers are
		 * selected by the distance i between the two words.
		 */
		for (i = 1; i < FEATURE_WINDOW_SIZE; i ++) {
			h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
			h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];

			/*
			 * Look the pair up through a stack key first and take memory
			 * from the pool only when the pair is new: pool memory is not
			 * returned before the pool is destroyed, so allocating a node
			 * for every duplicate would be pure waste.
			 * NOTE(review): assumes token_node_compare_func reads only h1
			 * and h2; the previous code likewise set only these two fields
			 * before the lookup - confirm.
			 */
			probe.h1 = h1;
			probe.h2 = h2;
			if (g_tree_lookup (*tree, &probe) != NULL) {
				continue;
			}

			node = memory_pool_alloc (pool, sizeof (token_node_t));
			node->h1 = h1;
			node->h2 = h2;
			/* The node serves as both key and value */
			g_tree_insert (*tree, node, node);
		}
	}

	return TRUE;
}
/*
* vi:ts=4
*/
|