aboutsummaryrefslogtreecommitdiffstats
path: root/src/tokenizers
diff options
context:
space:
mode:
Diffstat (limited to 'src/tokenizers')
-rw-r--r--src/tokenizers/tokenizers.c37
1 files changed, 33 insertions, 4 deletions
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index ab073a28c..5e3d39c50 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -47,6 +47,35 @@ const int primes[] = {
797, 3277,
};
+const gchar t_delimiters[255] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+ 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
+ 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
+ 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0
+};
+
struct tokenizer *
get_tokenizer (char *name)
{
@@ -78,7 +107,7 @@ f_str_t *
get_next_word (f_str_t * buf, f_str_t * token)
{
size_t remain;
- unsigned char *pos;
+ guchar *pos;
if (buf == NULL) {
return NULL;
@@ -95,13 +124,13 @@ get_next_word (f_str_t * buf, f_str_t * token)
return NULL;
}
pos = token->begin;
- /* Skip non graph symbols */
- while (remain > 0 && (!g_ascii_isgraph (*pos) && *pos < 127)) {
+ /* Skip non delimiters symbols */
+ while (remain > 0 && t_delimiters[*pos]) {
token->begin++;
pos++;
remain--;
}
- while (remain > 0 && (g_ascii_isgraph (*pos) || *pos > 127)) {
+ while (remain > 0 && !t_delimiters[*pos]) {
token->len++;
pos++;
remain--;