about | summary | refs | log | tree | commit | diff | stats
path: root/src/tokenizers
diff options
context:
space:
mode:
Diffstat (limited to 'src/tokenizers')
-rw-r--r-- src/tokenizers/osb.c 4
-rw-r--r-- src/tokenizers/tokenizers.c 18
-rw-r--r-- src/tokenizers/tokenizers.h 15
3 files changed, 35 insertions, 2 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index f78e20992..6799a121b 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -21,7 +21,7 @@ static const int primes[] = {
};
token_list_t *
-osb_tokenize_text (memory_pool_t *pool, f_str_t *input)
+osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input)
{
token_list_t *new = NULL, *head = NULL, *last = NULL;
f_str_t token = { NULL, 0, 0 };
@@ -33,7 +33,7 @@ osb_tokenize_text (memory_pool_t *pool, f_str_t *input)
hashpipe[i] = 0xABCDEF;
}
- while (get_next_word (input, &token)) {
+ while (tokenizer->get_next_word (input, &token)) {
/* Shift hashpipe */
for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) {
hashpipe[i] = hashpipe[i - 1];
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index 132a57ce0..25b13a289 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -5,6 +5,24 @@
#include <sys/types.h>
#include "tokenizers.h"
+struct tokenizer tokenizers[] = { /* table scanned linearly by get_tokenizer() */
+ {"osb-text", osb_tokenize_text, get_next_word }, /* positional: name, tokenize_func, get_next_word */
+};
+
+struct tokenizer*
+get_tokenizer (char *name)
+{
+ size_t i; /* unsigned, like the sizeof-derived element count it is compared with */
+ /* A NULL name matches nothing; testing it in the loop condition keeps strcmp() defined */
+ for (i = 0; name != NULL && i < sizeof (tokenizers) / sizeof (tokenizers[0]); i ++) {
+ if (strcmp (tokenizers[i].name, name) == 0) {
+ return &tokenizers[i];
+ }
+ }
+
+ return NULL; /* no entry with this name (or name was NULL) */
+}
+
/* Get next word from specified f_str_t buf */
f_str_t *
get_next_word (f_str_t *buf, f_str_t *token)
diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h
index 6b4bff5e0..96a2027a5 100644
--- a/src/tokenizers/tokenizers.h
+++ b/src/tokenizers/tokenizers.h
@@ -20,8 +20,23 @@ typedef struct token_list_s {
struct token_list_s *next;
} token_list_t;
+
+/* Common tokenizer structure: a name plus the two callbacks that implement the tokenizer */
+struct tokenizer {
+ char *name; /* lookup key; get_tokenizer() compares it with strcmp() */
+ token_list_t* (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input); /* takes a tokenizer descriptor, a memory pool and the input; returns a token_list_t* */
+ f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token); /* word splitter; osb_tokenize_text keeps calling it until it returns NULL */
+};
+
+/* Get tokenizer structure by name or return NULL if this name is not found */
+struct tokenizer* get_tokenizer (char *name);
/* Get next word from specified f_str_t buf */
f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
+/* OSB tokenize function */
+token_list_t* osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
+
+/* Array of all defined tokenizers */
+extern struct tokenizer tokenizers[];
#endif
/*