
* Use a binary tree in tokenizers; this gives us fast checking for unique tokens with O(log n) complexity

tags/0.2.7
Vsevolod Stakhov 15 years ago
parent
commit
249c0583d2
3 changed files with 30 additions and 21 deletions
  1. src/tokenizers/osb.c: +12 -15
  2. src/tokenizers/tokenizers.c: +12 -0
  3. src/tokenizers/tokenizers.h: +6 -6
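
The pattern the commit switches to is GLib's GTree used as a set: the keys are token nodes, the comparator orders them by their two hashes, and a g_tree_lookup() before each g_tree_insert() keeps the set free of duplicates at O(log n) per token. A minimal standalone sketch of that pattern, using plain stack storage instead of rspamd's memory_pool_alloc and hypothetical names (hash_pair, hash_pair_cmp), not code from this commit:

#include <glib.h>
#include <stdint.h>
#include <stdio.h>

/* A token is identified by a pair of 32-bit hashes, as in token_node_t */
struct hash_pair {
	uint32_t h1;
	uint32_t h2;
};

/* Order pairs by h1, then by h2, so GTree can keep them sorted */
static gint
hash_pair_cmp (gconstpointer a, gconstpointer b)
{
	const struct hash_pair *aa = a, *bb = b;

	if (aa->h1 != bb->h1) {
		return aa->h1 < bb->h1 ? -1 : 1;
	}
	if (aa->h2 != bb->h2) {
		return aa->h2 < bb->h2 ? -1 : 1;
	}
	return 0;
}

int
main (void)
{
	GTree *tree = g_tree_new (hash_pair_cmp);
	struct hash_pair tokens[] = { {1, 2}, {1, 2}, {3, 4} }; /* one duplicate */
	guint i;

	for (i = 0; i < G_N_ELEMENTS (tokens); i ++) {
		/* Insert only if the pair is not in the tree yet: O(log n) check */
		if (g_tree_lookup (tree, &tokens[i]) == NULL) {
			g_tree_insert (tree, &tokens[i], &tokens[i]);
		}
	}

	printf ("unique tokens: %d\n", g_tree_nnodes (tree)); /* prints 2 */
	g_tree_destroy (tree);

	return 0;
}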

src/tokenizers/osb.c: +12 -15

@@ -20,10 +20,11 @@ static const int primes[] = {
 	797, 3277,
 };
 
-token_list_t *
+GTree *
 osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input)
 {
-	token_list_t *new = NULL, *head = NULL, *last = NULL;
+	token_node_t *new = NULL;
+	GTree *tree;
 	f_str_t token = { NULL, 0, 0 };
 	uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
 	int i;
@@ -33,6 +34,9 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
 		hashpipe[i] = 0xABCDEF;
 	}
 
+	tree = g_tree_new (token_node_compare_func);
+	memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, tree);
+
 	while (tokenizer->get_next_word (input, &token)) {
 		/* Shift hashpipe */
 		for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) {
@@ -43,25 +47,18 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
 		for (i = 0; i < FEATURE_WINDOW_SIZE - 2; i ++) {
 			h1 = hashpipe[0]* primes[0] + hashpipe[i] * primes[i<<1];
 			h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i<<1)-1];
-			new = memory_pool_alloc (pool, sizeof (token_list_t));
+			new = memory_pool_alloc (pool, sizeof (token_node_t));
 			new->h1 = h1;
 			new->h2 = h2;
-			if (last) {
-				last->next = new;
-			}
-			else {
-				head = new;
-			}
-			last = new;
 
-			msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2);
+			if (g_tree_lookup (tree, new) == NULL) {
+				msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2);
+				g_tree_insert (tree, new, new);
+			}
 		}
 	}
-	if (last) {
-		last->next = NULL;
-	}
 
-	return head;
+	return tree;
 }
 
 /*
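
Callers that previously walked the token_list_t chain through its next pointer now receive a GTree and would traverse it with g_tree_foreach(). A hedged sketch of such a consumer; the function names dump_token and process_tokens are hypothetical, and the actual rspamd call sites are not shown in this diff:

/* Hypothetical consumer of the tree returned by osb_tokenize_text () */
static gboolean
dump_token (gpointer key, gpointer value, gpointer user_data)
{
	token_node_t *node = value;

	msg_debug ("token: h1=%u, h2=%u", node->h1, node->h2);
	return FALSE; /* returning FALSE keeps the traversal going */
}

static void
process_tokens (GTree *tokens)
{
	msg_debug ("got %d unique tokens", g_tree_nnodes (tokens));
	g_tree_foreach (tokens, dump_token, NULL);
}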

src/tokenizers/tokenizers.c: +12 -0

@@ -23,6 +23,18 @@ get_tokenizer (char *name)
 	return NULL;
 }
 
+int
+token_node_compare_func (gconstpointer a, gconstpointer b)
+{
+	const token_node_t *aa = a, *bb = b;
+
+	if (aa->h1 == bb->h1) {
+		return aa->h2 - bb->h2;
+	}
+
+	return aa->h1 - bb->h1;
+}
+
 /* Get next word from specified f_str_t buf */
 f_str_t *
 get_next_word (f_str_t *buf, f_str_t *token)
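
One caveat, not addressed by this commit: returning aa->h2 - bb->h2 on uint32_t fields converts an unsigned difference to int, so hashes more than 2^31 apart can yield the wrong sign and break the strict ordering GTree relies on. A hedged sketch of an explicit-comparison variant, offered only as an alternative, not as what the commit does:

int
token_node_compare_func (gconstpointer a, gconstpointer b)
{
	const token_node_t *aa = a, *bb = b;

	/* Compare without subtraction so large unsigned hashes cannot wrap */
	if (aa->h1 != bb->h1) {
		return aa->h1 < bb->h1 ? -1 : 1;
	}
	if (aa->h2 != bb->h2) {
		return aa->h2 < bb->h2 ? -1 : 1;
	}

	return 0;
}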

src/tokenizers/tokenizers.h: +6 -6

@@ -14,26 +14,26 @@
 /* Size for features pipe */
 #define FEATURE_WINDOW_SIZE 5
 
-typedef struct token_list_s {
+typedef struct token_node_s {
 	uint32_t h1;
 	uint32_t h2;
-	struct token_list_s *next;
-} token_list_t;
-
+} token_node_t;
 
 /* Common tokenizer structure */
 struct tokenizer {
 	char *name;
-	token_list_t* (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
+	GTree* (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
 	f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token);
 };
 
+/* Compare two token nodes */
+int token_node_compare_func (gconstpointer a, gconstpointer b);
 /* Get tokenizer structure by name or return NULL if this name is not found */
 struct tokenizer* get_tokenizer (char *name);
 /* Get next word from specified f_str_t buf */
 f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
 /* OSB tokenize function */
-token_list_t* osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
+GTree* osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
 
 /* Array of all defined tokenizers */
 extern struct tokenizer tokenizers[];
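
With the header change, every tokenizer's tokenize_func now yields a GTree, so dispatch through the tokenizers[] array stays uniform. A hedged sketch of how a caller might select and invoke a tokenizer; the wrapper tokenize_with and the tokenizer name "osb" are assumptions, not shown in this diff:

/* Hypothetical call site: pick a tokenizer by name and run it */
static GTree *
tokenize_with (const char *name, memory_pool_t *pool, f_str_t *input)
{
	struct tokenizer *tok = get_tokenizer ((char *)name);

	if (tok == NULL) {
		return NULL; /* unknown tokenizer name */
	}

	/* Returns a tree of unique token_node_t entries, owned by the pool */
	return tok->tokenize_func (tok, pool, input);
}

/* e.g. GTree *tokens = tokenize_with ("osb", pool, &input_text); */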
