aboutsummaryrefslogtreecommitdiffstats
path: root/src/tokenizers
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-01-24 20:45:54 +0300
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-01-24 20:45:54 +0300
commit: 76b69f300d8372969b6143e3e269376229d03edf (patch)
tree: d9c4dc4bfed5635869f2c9d83e9ebb94d00903a1 /src/tokenizers
parent: b0d0a4ce50733ce162ce9738da2d416497f98763 (diff)
download: rspamd-76b69f300d8372969b6143e3e269376229d03edf.tar.gz
download: rspamd-76b69f300d8372969b6143e3e269376229d03edf.zip
* Many fixes to fuzzy hashes logic and tokenization.
Diffstat (limited to 'src/tokenizers')
-rw-r--r-- src/tokenizers/tokenizers.c 37
1 files changed, 33 insertions, 4 deletions
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index ab073a28c..5e3d39c50 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -47,6 +47,35 @@ const int primes[] = {
797, 3277,
};
+const gchar t_delimiters[256] = { /* 1 = token delimiter; one slot per guchar value (0..255) so t_delimiters[*pos] never reads past the end */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, /* 0-9: '\t' */
+ 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, /* 10-19: '\n' '\r' */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, /* 30-39: '"' '#' '$' '%' '&'; NOTE(review): ' ' (32) and '!' (33) are not flagged - confirm this is intended */
+ 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, /* 40-49: '(' ')' '*' '+' ',' '.' '/' ('-' is a token character) */
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, /* 50-59: ':' ';' */
+ 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 60-69: '<' '=' '>' '?' */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, /* 90-99: '[' '\\' ']' '^' '_' '`' */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, /* 120-129: '{' '|' '}' '~' */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 130 and above: no byte is a delimiter */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0 /* 250-255: entry for byte 0xFF added; it was missing from the 255-slot table */
+};
+
struct tokenizer *
get_tokenizer (char *name)
{
@@ -78,7 +107,7 @@ f_str_t *
get_next_word (f_str_t * buf, f_str_t * token)
{
size_t remain;
- unsigned char *pos;
+ guchar *pos;
if (buf == NULL) {
return NULL;
@@ -95,13 +124,13 @@ get_next_word (f_str_t * buf, f_str_t * token)
return NULL;
}
pos = token->begin;
- /* Skip non graph symbols */
- while (remain > 0 && (!g_ascii_isgraph (*pos) && *pos < 127)) {
+ /* Skip leading delimiter symbols */
+ while (remain > 0 && t_delimiters[*pos]) {
token->begin++;
pos++;
remain--;
}
- while (remain > 0 && (g_ascii_isgraph (*pos) || *pos > 127)) {
+ while (remain > 0 && !t_delimiters[*pos]) {
token->len++;
pos++;
remain--;