diff options
Diffstat (limited to 'src/tokenizers')
-rw-r--r-- | src/tokenizers/osb.c        |  2
-rw-r--r-- | src/tokenizers/tokenizers.c | 10
2 files changed, 8 insertions, 4 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index be930af28..afd2febd8 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -44,7 +44,7 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
 		}
 		hashpipe[0] = fstrhash (&token);
 
-		for (i = 0; i < FEATURE_WINDOW_SIZE - 2; i ++) {
+		for (i = 1; i < FEATURE_WINDOW_SIZE; i ++) {
 			h1 = hashpipe[0]* primes[0] + hashpipe[i] * primes[i<<1];
 			h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i<<1)-1];
 			new = memory_pool_alloc (pool, sizeof (token_node_t));
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index 853207af4..280ebd477 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -50,14 +50,14 @@ get_next_word (f_str_t *buf, f_str_t *token)
 		token->begin = buf->begin;
 	}
 
+	token->begin = token->begin + token->len;
+	token->len = 0;
+
 	remain = buf->len - (token->begin - buf->begin);
 	if (remain <= 0) {
 		return NULL;
 	}
 
-	token->begin = token->begin + token->len;
-	token->len = 0;
-
 	pos = token->begin;
 	/* Skip non graph symbols */
 	while (remain-- && !g_ascii_isgraph (*pos ++)) {
@@ -66,6 +66,10 @@ get_next_word (f_str_t *buf, f_str_t *token)
 	while (remain-- && g_ascii_isgraph (*pos ++)) {
 		token->len ++;
 	}
+
+	if (token->len == 0) {
+		return NULL;
+	}
 
 	return token;
 }