diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2008-11-07 19:35:13 +0300 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2008-11-07 19:35:13 +0300 |
commit | 62cdcc73c4f817516cbcb20e9e5bfad556bea4b7 (patch) | |
tree | a6ab9b606bf4e44e405fe9ee4dae983938c9cee4 /src/tokenizers/tokenizers.c | |
parent | 2175980532791f90807eb03ef99d6f7006ada4e6 (diff) | |
download | rspamd-62cdcc73c4f817516cbcb20e9e5bfad556bea4b7.tar.gz rspamd-62cdcc73c4f817516cbcb20e9e5bfad556bea4b7.zip |
* Add simple implementation of OSB tokenizer
Diffstat (limited to 'src/tokenizers/tokenizers.c')
-rw-r--r-- | src/tokenizers/tokenizers.c | 45 |
1 files changed, 45 insertions, 0 deletions
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c new file mode 100644 index 000000000..132a57ce0 --- /dev/null +++ b/src/tokenizers/tokenizers.c @@ -0,0 +1,45 @@ +/* + * Common tokenization functions + */ + +#include <sys/types.h> +#include "tokenizers.h" + +/* Get next word from specified f_str_t buf */ +f_str_t * +get_next_word (f_str_t *buf, f_str_t *token) +{ + size_t remain; + char *pos; + + if (buf == NULL) { + return NULL; + } + + if (token->begin == NULL) { + token->begin = buf->begin; + } + + remain = buf->len - (token->begin - buf->begin); + if (remain <= 0) { + return NULL; + } + + token->begin = token->begin + token->len; + token->len = 0; + + pos = token->begin; + /* Skip non graph symbols */ + while (remain-- && !g_ascii_isgraph (*pos ++)) { + token->begin ++; + } + while (remain-- && g_ascii_isgraph (*pos ++)) { + token->len ++; + } + + return token; +} + +/* + * vi:ts=4 + */ |