diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-01-24 20:45:54 +0300 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-01-24 20:45:54 +0300 |
commit | 76b69f300d8372969b6143e3e269376229d03edf (patch) | |
tree | d9c4dc4bfed5635869f2c9d83e9ebb94d00903a1 /src/tokenizers | |
parent | b0d0a4ce50733ce162ce9738da2d416497f98763 (diff) | |
download | rspamd-76b69f300d8372969b6143e3e269376229d03edf.tar.gz rspamd-76b69f300d8372969b6143e3e269376229d03edf.zip |
* Many fixes to fuzzy hashes logic and tokenization.
Diffstat (limited to 'src/tokenizers')
-rw-r--r-- | src/tokenizers/tokenizers.c | 37 |
1 files changed, 33 insertions, 4 deletions
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c index ab073a28c..5e3d39c50 100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -47,6 +47,35 @@ const int primes[] = { 797, 3277, }; +const gchar t_delimiters[255] = { /* Byte-indexed lookup table: a non-zero entry marks that byte value as a token delimiter. NOTE(review): 255 entries cover indices 0..254 only, while get_next_word indexes the table with a guchar; if guchar is an 8-bit unsigned char, a byte value of 255 would read past the end - confirm and consider sizing the table to 256. NOTE(review): entries 32 and 33 are 0, so ASCII space and '!' are not delimiters here - verify this is intended. */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, + 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0 +}; + struct tokenizer * get_tokenizer (char *name) { @@ -78,7 +107,7 @@ f_str_t * get_next_word (f_str_t * buf, f_str_t * token) { size_t remain; - unsigned char *pos; + guchar *pos; if (buf == NULL) { return NULL; @@ -95,13 +124,13 @@ get_next_word (f_str_t * buf, f_str_t * token) return NULL; } pos = token->begin; - /* Skip non graph symbols */ - while (remain > 0 && (!g_ascii_isgraph (*pos) && *pos < 127)) { + /* Skip leading delimiter bytes (those with a non-zero t_delimiters entry) */ + while (remain > 0 && t_delimiters[*pos]) { token->begin++; pos++; remain--; } - while (remain > 0 && (g_ascii_isgraph (*pos) || *pos > 127)) { + while (remain > 0 && !t_delimiters[*pos]) { token->len++; pos++; remain--; |