797, 3277, | 797, 3277, | ||||
}; | }; | ||||
token_list_t * | |||||
GTree * | |||||
osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input) | osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input) | ||||
{ | { | ||||
token_list_t *new = NULL, *head = NULL, *last = NULL; | |||||
token_node_t *new = NULL; | |||||
GTree *tree; | |||||
f_str_t token = { NULL, 0, 0 }; | f_str_t token = { NULL, 0, 0 }; | ||||
uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2; | uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2; | ||||
int i; | int i; | ||||
hashpipe[i] = 0xABCDEF; | hashpipe[i] = 0xABCDEF; | ||||
} | } | ||||
tree = g_tree_new (token_node_compare_func); | |||||
memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, tree); | |||||
while (tokenizer->get_next_word (input, &token)) { | while (tokenizer->get_next_word (input, &token)) { | ||||
/* Shift hashpipe */ | /* Shift hashpipe */ | ||||
for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) { | for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) { | ||||
for (i = 0; i < FEATURE_WINDOW_SIZE - 2; i ++) { | for (i = 0; i < FEATURE_WINDOW_SIZE - 2; i ++) { | ||||
h1 = hashpipe[0]* primes[0] + hashpipe[i] * primes[i<<1]; | h1 = hashpipe[0]* primes[0] + hashpipe[i] * primes[i<<1]; | ||||
h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i<<1)-1]; | h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i<<1)-1]; | ||||
new = memory_pool_alloc (pool, sizeof (token_list_t)); | |||||
new = memory_pool_alloc (pool, sizeof (token_node_t)); | |||||
new->h1 = h1; | new->h1 = h1; | ||||
new->h2 = h2; | new->h2 = h2; | ||||
if (last) { | |||||
last->next = new; | |||||
} | |||||
else { | |||||
head = new; | |||||
} | |||||
last = new; | |||||
msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2); | |||||
if (g_tree_lookup (tree, new) == NULL) { | |||||
msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2); | |||||
g_tree_insert (tree, new, new); | |||||
} | |||||
} | } | ||||
} | } | ||||
if (last) { | |||||
last->next = NULL; | |||||
} | |||||
return head; | |||||
return tree; | |||||
} | } | ||||
/* | /* |
return NULL; | return NULL; | ||||
} | } | ||||
int | |||||
token_node_compare_func (gconstpointer a, gconstpointer b) | |||||
{ | |||||
const token_node_t *aa = a, *bb = b; | |||||
if (aa->h1 == bb->h1) { | |||||
return aa->h2 - bb->h2; | |||||
} | |||||
return aa->h1 - bb->h1; | |||||
} | |||||
/* Get next word from specified f_str_t buf */ | /* Get next word from specified f_str_t buf */ | ||||
f_str_t * | f_str_t * | ||||
get_next_word (f_str_t *buf, f_str_t *token) | get_next_word (f_str_t *buf, f_str_t *token) |
/* Size for features pipe (sliding window of token hashes) */
#define FEATURE_WINDOW_SIZE 5

/* One OSB feature: a pair of hashes derived from the same token window
 * with different prime coefficients (see osb_tokenize_text) */
typedef struct token_node_s {
	uint32_t h1;	/* first feature hash */
	uint32_t h2;	/* second feature hash, distinct coefficient set */
} token_node_t;
/* Common tokenizer structure */ | /* Common tokenizer structure */ | ||||
struct tokenizer { | struct tokenizer { | ||||
char *name; | char *name; | ||||
token_list_t* (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input); | |||||
GTree* (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input); | |||||
f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token); | f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token); | ||||
}; | }; | ||||
/* Compare two token nodes */ | |||||
int token_node_compare_func (gconstpointer a, gconstpointer b); | |||||
/* Get tokenizer structure by name or return NULL if this name is not found */ | /* Get tokenizer structure by name or return NULL if this name is not found */ | ||||
struct tokenizer* get_tokenizer (char *name); | struct tokenizer* get_tokenizer (char *name); | ||||
/* Get next word from specified f_str_t buf */ | /* Get next word from specified f_str_t buf */ | ||||
f_str_t *get_next_word (f_str_t *buf, f_str_t *token); | f_str_t *get_next_word (f_str_t *buf, f_str_t *token); | ||||
/* OSB tokenize function */ | /* OSB tokenize function */ | ||||
token_list_t* osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input); | |||||
GTree* osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input); | |||||
/* Array of all defined tokenizers */ | /* Array of all defined tokenizers */ | ||||
extern struct tokenizer tokenizers[]; | extern struct tokenizer tokenizers[]; |