summaryrefslogtreecommitdiffstats
path: root/src/tokenizers/tokenizers.c
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2008-11-07 19:35:13 +0300
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2008-11-07 19:35:13 +0300
commit: 62cdcc73c4f817516cbcb20e9e5bfad556bea4b7 (patch)
tree: a6ab9b606bf4e44e405fe9ee4dae983938c9cee4 /src/tokenizers/tokenizers.c
parent: 2175980532791f90807eb03ef99d6f7006ada4e6 (diff)
download: rspamd-62cdcc73c4f817516cbcb20e9e5bfad556bea4b7.tar.gz
download: rspamd-62cdcc73c4f817516cbcb20e9e5bfad556bea4b7.zip
* Add simple implementation of OSB tokenizer
Diffstat (limited to 'src/tokenizers/tokenizers.c')
-rw-r--r-- src/tokenizers/tokenizers.c | 45
1 files changed, 45 insertions, 0 deletions
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
new file mode 100644
index 000000000..132a57ce0
--- /dev/null
+++ b/src/tokenizers/tokenizers.c
@@ -0,0 +1,45 @@
+/*
+ * Common tokenization functions
+ */
+
+#include <sys/types.h>
+#include "tokenizers.h"
+
+/*
+ * Get the next word (a run of g_ascii_isgraph characters) from buf. token is
+ * the cursor: set token->begin to NULL and token->len to 0 before the first
+ * call; every call resumes right after the word returned by the previous one.
+ * Returns token, or NULL if buf/token is NULL or no word is left in buf.
+ */
+f_str_t *
+get_next_word (f_str_t *buf, f_str_t *token)
+{
+	size_t offset, end;
+
+	if (buf == NULL || token == NULL || buf->begin == NULL) {
+		return NULL;
+	}
+	if (token->begin == NULL) {
+		token->begin = buf->begin;
+	}
+	/* Resume right after the previous word, then skip non graph symbols */
+	offset = (size_t)(token->begin - buf->begin) + token->len;
+	while (offset < buf->len && !g_ascii_isgraph (buf->begin[offset])) {
+		offset ++;
+	}
+	/* Measure the word; both scans stop at buf->len, so nothing past it is read */
+	end = offset;
+	while (end < buf->len && g_ascii_isgraph (buf->begin[end])) {
+		end ++;
+	}
+	if (end == offset) {
+		return NULL; /* only non graph symbols (or nothing) were left */
+	}
+	token->begin = buf->begin + offset;
+	token->len = end - offset;
+	return token;
+}
+
+/*
+ * vi:ts=4
+ */