about summary refs log tree commit diff stats
path: root/src/tokenizers
diff options
context:
space:
mode:
Diffstat (limited to 'src/tokenizers')
-rw-r--r-- src/tokenizers/osb.c 2
-rw-r--r-- src/tokenizers/tokenizers.c 10
2 files changed, 8 insertions, 4 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index be930af28..afd2febd8 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -44,7 +44,7 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
}
hashpipe[0] = fstrhash (&token);
- for (i = 0; i < FEATURE_WINDOW_SIZE - 2; i ++) {
+ for (i = 1; i < FEATURE_WINDOW_SIZE; i ++) {
h1 = hashpipe[0]* primes[0] + hashpipe[i] * primes[i<<1];
h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i<<1)-1];
new = memory_pool_alloc (pool, sizeof (token_node_t));
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index 853207af4..280ebd477 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -50,14 +50,14 @@ get_next_word (f_str_t *buf, f_str_t *token)
token->begin = buf->begin;
}
+ token->begin = token->begin + token->len;
+ token->len = 0;
+
remain = buf->len - (token->begin - buf->begin);
if (remain <= 0) {
return NULL;
}
- token->begin = token->begin + token->len;
- token->len = 0;
-
pos = token->begin;
/* Skip non graph symbols */
while (remain-- && !g_ascii_isgraph (*pos ++)) {
@@ -66,6 +66,10 @@ get_next_word (f_str_t *buf, f_str_t *token)
while (remain-- && g_ascii_isgraph (*pos ++)) {
token->len ++;
}
+
+ if (token->len == 0) {
+ return NULL;
+ }
return token;
}