diff options
Diffstat (limited to 'src/tokenizers')
-rw-r--r-- | src/tokenizers/osb.c | 4 | ||||
-rw-r--r-- | src/tokenizers/tokenizers.c | 18 | ||||
-rw-r--r-- | src/tokenizers/tokenizers.h | 15 |
3 files changed, 35 insertions, 2 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c index f78e20992..6799a121b 100644 --- a/src/tokenizers/osb.c +++ b/src/tokenizers/osb.c @@ -21,7 +21,7 @@ static const int primes[] = { }; token_list_t * -osb_tokenize_text (memory_pool_t *pool, f_str_t *input) +osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input) { token_list_t *new = NULL, *head = NULL, *last = NULL; f_str_t token = { NULL, 0, 0 }; @@ -33,7 +33,7 @@ osb_tokenize_text (memory_pool_t *pool, f_str_t *input) hashpipe[i] = 0xABCDEF; } - while (get_next_word (input, &token)) { + while (tokenizer->get_next_word (input, &token)) { /* Shift hashpipe */ for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) { hashpipe[i] = hashpipe[i - 1]; diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c index 132a57ce0..25b13a289 100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -5,6 +5,24 @@ #include <sys/types.h> #include "tokenizers.h" +struct tokenizer tokenizers[] = { + {"osb-text", osb_tokenize_text, get_next_word }, +}; + +struct tokenizer* +get_tokenizer (char *name) +{ + int i; + + for (i = 0; i < sizeof (tokenizers) / sizeof (tokenizers[0]); i ++) { + if (strcmp (tokenizers[i].name, name) == 0) { + return &tokenizers[i]; + } + } + + return NULL; +} + /* Get next word from specified f_str_t buf */ f_str_t * get_next_word (f_str_t *buf, f_str_t *token) diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h index 6b4bff5e0..96a2027a5 100644 --- a/src/tokenizers/tokenizers.h +++ b/src/tokenizers/tokenizers.h @@ -20,8 +20,23 @@ typedef struct token_list_s { struct token_list_s *next; } token_list_t; + +/* Common tokenizer structure */ +struct tokenizer { + char *name; + token_list_t* (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input); + f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token); +}; + +/* Get tokenizer structure by name or return NULL if this name is not found */ +struct tokenizer* get_tokenizer (char *name); /* Get next word from specified f_str_t buf */ f_str_t *get_next_word (f_str_t *buf, f_str_t *token); +/* OSB tokenize function */ +token_list_t* osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input); + +/* Array of all defined tokenizers */ +extern struct tokenizer tokenizers[]; #endif /* |