diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-06-24 20:25:54 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-06-24 20:25:54 +0400 |
commit | a3fa4d672341fd2f1888d3a2f2ed85ae57913b78 (patch) | |
tree | 352c634bbbc74cf17644545ace66a8feedc841c3 /src/tokenizers/tokenizers.c | |
parent | 63725086863e4f422340479f83dd7ef374613e76 (diff) | |
download | rspamd-a3fa4d672341fd2f1888d3a2f2ed85ae57913b78.tar.gz rspamd-a3fa4d672341fd2f1888d3a2f2ed85ae57913b78.zip |
* Welcome 0.4.0
Incompatible changes:
- Statistics are incompatible in utf8 mode
Major changes:
- Improved utf8 mode
- Convert all characters to lowercase in statistics
- Skip URLs in statistics
- Improve speed of bayes classifier by using integer arithmetic
- Fixed statfiles synchronization that was broken for a long time
- Synchronization is now configurable
Minor changes:
- Bugfixes
- Removed some legacy code
- Types polishing
Diffstat (limited to 'src/tokenizers/tokenizers.c')
-rw-r--r-- | src/tokenizers/tokenizers.c | 64 |
1 files changed, 48 insertions, 16 deletions
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c index 9e41a9101..be73e506d 100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -52,7 +52,7 @@ const gchar t_delimiters[255] = { 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, - 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -103,44 +103,76 @@ token_node_compare_func (gconstpointer a, gconstpointer b) } /* Get next word from specified f_str_t buf */ -f_str_t * -get_next_word (f_str_t * buf, f_str_t * token) +gchar * +get_next_word (f_str_t * buf, f_str_t * token, GList **exceptions) { - size_t remain; - guchar *pos; + gsize remain, pos; + guchar *p; + struct process_exception *ex = NULL; if (buf == NULL) { return NULL; } + + if (*exceptions != NULL) { + ex = (*exceptions)->data; + } + if (token->begin == NULL) { - token->begin = buf->begin; + if (ex != NULL) { + if (ex->pos == 0) { + token->begin = buf->begin + ex->len; + token->len = ex->len; + } + else { + token->begin = buf->begin; + token->len = 0; + } + } + else { + token->begin = buf->begin; + token->len = 0; + } } - token->begin = token->begin + token->len; token->len = 0; remain = buf->len - (token->begin - buf->begin); if (remain <= 0) { return NULL; } - pos = token->begin; + pos = token->begin - buf->begin; + p = token->begin; /* Skip non delimiters symbols */ - while (remain > 0 && t_delimiters[*pos]) { - token->begin++; + do { + if (ex != NULL && ex->pos == pos) { + /* Go to the next exception */ + *exceptions = g_list_next (*exceptions); + return p + ex->len + 1; + } pos++; + p++; remain--; - } - while (remain > 0 && !t_delimiters[*pos]) { + } while (remain > 0 && t_delimiters[*p]); + + token->begin = p; + + while (remain > 0 && !t_delimiters[*p]) { + if (ex != NULL && ex->pos == pos) { + *exceptions = g_list_next (*exceptions); + return p + ex->len + 1; + } 
token->len++; pos++; remain--; + p ++; } - if (token->len == 0) { + if (remain == 0) { return NULL; } - return token; + return p; } /* Struct to access gmime headers */ @@ -239,13 +271,13 @@ tokenize_subject (struct worker_task *task, GTree ** tree) new = memory_pool_alloc (task->task_pool, sizeof (token_node_t)); subject.begin = task->subject; subject.len = strlen (task->subject); - osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE); + osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE, NULL); } if ((sub = g_mime_message_get_subject (task->message)) != NULL) { new = memory_pool_alloc (task->task_pool, sizeof (token_node_t)); subject.begin = (gchar *)sub; subject.len = strlen (sub); - osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE); + osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE, NULL); } } |