Просмотр исходного кода

* Use a binary tree in the tokenizers; it provides fast checking for unique tokens with O(log n) lookup complexity

tags/0.2.7
Vsevolod Stakhov 15 лет назад
Родитель
Commit
249c0583d2
3 измененных файлов: 30 добавлений и 21 удалений
  1. 12
    15
      src/tokenizers/osb.c
  2. 12
    0
      src/tokenizers/tokenizers.c
  3. 6
    6
      src/tokenizers/tokenizers.h

+ 12
- 15
src/tokenizers/osb.c Просмотреть файл

797, 3277, 797, 3277,
}; };


token_list_t *
GTree *
osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input) osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input)
{ {
token_list_t *new = NULL, *head = NULL, *last = NULL;
token_node_t *new = NULL;
GTree *tree;
f_str_t token = { NULL, 0, 0 }; f_str_t token = { NULL, 0, 0 };
uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2; uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
int i; int i;
hashpipe[i] = 0xABCDEF; hashpipe[i] = 0xABCDEF;
} }


tree = g_tree_new (token_node_compare_func);
memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, tree);

while (tokenizer->get_next_word (input, &token)) { while (tokenizer->get_next_word (input, &token)) {
/* Shift hashpipe */ /* Shift hashpipe */
for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) { for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) {
for (i = 0; i < FEATURE_WINDOW_SIZE - 2; i ++) { for (i = 0; i < FEATURE_WINDOW_SIZE - 2; i ++) {
h1 = hashpipe[0]* primes[0] + hashpipe[i] * primes[i<<1]; h1 = hashpipe[0]* primes[0] + hashpipe[i] * primes[i<<1];
h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i<<1)-1]; h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i<<1)-1];
new = memory_pool_alloc (pool, sizeof (token_list_t));
new = memory_pool_alloc (pool, sizeof (token_node_t));
new->h1 = h1; new->h1 = h1;
new->h2 = h2; new->h2 = h2;
if (last) {
last->next = new;
}
else {
head = new;
}
last = new;


msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2);
if (g_tree_lookup (tree, new) == NULL) {
msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2);
g_tree_insert (tree, new, new);
}
} }
} }
if (last) {
last->next = NULL;
}


return head;
return tree;
} }


/* /*

+ 12
- 0
src/tokenizers/tokenizers.c Просмотреть файл

return NULL; return NULL;
} }


/*
 * Compare two token nodes for ordering in a GTree (GCompareFunc).
 * Orders primarily by h1, then by h2; returns <0, 0 or >0.
 *
 * Note: the original returned `aa->h2 - bb->h2` / `aa->h1 - bb->h1`.
 * Those fields are uint32_t, so the subtraction is performed in unsigned
 * arithmetic and wraps; converting the wrapped value to int can yield the
 * wrong sign whenever the two values differ by more than INT_MAX, which
 * breaks the ordering invariant the tree relies on. Explicit three-way
 * comparisons avoid that.
 */
int
token_node_compare_func (gconstpointer a, gconstpointer b)
{
	const token_node_t *aa = a, *bb = b;

	if (aa->h1 == bb->h1) {
		if (aa->h2 == bb->h2) {
			return 0;
		}
		return (aa->h2 < bb->h2) ? -1 : 1;
	}

	return (aa->h1 < bb->h1) ? -1 : 1;
}

/* Get next word from specified f_str_t buf */ /* Get next word from specified f_str_t buf */
f_str_t * f_str_t *
get_next_word (f_str_t *buf, f_str_t *token) get_next_word (f_str_t *buf, f_str_t *token)

+ 6
- 6
src/tokenizers/tokenizers.h Просмотреть файл

/* Size for features pipe */ /* Size for features pipe */
#define FEATURE_WINDOW_SIZE 5 #define FEATURE_WINDOW_SIZE 5


typedef struct token_list_s {
typedef struct token_node_s {
uint32_t h1; uint32_t h1;
uint32_t h2; uint32_t h2;
struct token_list_s *next;
} token_list_t;

} token_node_t;


/* Common tokenizer structure */ /* Common tokenizer structure */
struct tokenizer { struct tokenizer {
char *name; char *name;
token_list_t* (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
GTree* (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token); f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token);
}; };


/* Compare two token nodes */
int token_node_compare_func (gconstpointer a, gconstpointer b);
/* Get tokenizer structure by name or return NULL if this name is not found */ /* Get tokenizer structure by name or return NULL if this name is not found */
struct tokenizer* get_tokenizer (char *name); struct tokenizer* get_tokenizer (char *name);
/* Get next word from specified f_str_t buf */ /* Get next word from specified f_str_t buf */
f_str_t *get_next_word (f_str_t *buf, f_str_t *token); f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
/* OSB tokenize function */ /* OSB tokenize function */
token_list_t* osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
GTree* osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);


/* Array of all defined tokenizers */ /* Array of all defined tokenizers */
extern struct tokenizer tokenizers[]; extern struct tokenizer tokenizers[];

Загрузка…
Отмена
Сохранить