diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-06-24 20:25:54 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-06-24 20:25:54 +0400 |
commit | a3fa4d672341fd2f1888d3a2f2ed85ae57913b78 (patch) | |
tree | 352c634bbbc74cf17644545ace66a8feedc841c3 /src/tokenizers | |
parent | 63725086863e4f422340479f83dd7ef374613e76 (diff) | |
download | rspamd-a3fa4d672341fd2f1888d3a2f2ed85ae57913b78.tar.gz rspamd-a3fa4d672341fd2f1888d3a2f2ed85ae57913b78.zip |
* Welcome 0.4.0
Incompatible changes:
- Statistics are incompatible in utf8 mode
Major changes:
- Improved utf8 mode
- Convert all characters to lowercase in statistics
- Skip URLs in statistics
- Improve speed of bayes classifier by using integer arithmetic
- Fixed statfiles synchronization that was broken for a long time
- Synchronization is now configurable
Minor changes:
- Bugfixes
- Removed some of legacy code
- Types polishing
Diffstat (limited to 'src/tokenizers')
-rw-r--r-- | src/tokenizers/osb.c | 57 | ||||
-rw-r--r-- | src/tokenizers/tokenizers.c | 64 | ||||
-rw-r--r-- | src/tokenizers/tokenizers.h | 16 |
3 files changed, 86 insertions, 51 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c index 5f5dfcdcd..bc57255cb 100644 --- a/src/tokenizers/osb.c +++ b/src/tokenizers/osb.c @@ -36,55 +36,56 @@ extern const int primes[]; int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * input, GTree ** tree, - gboolean save_token, gboolean is_utf) + gboolean save_token, gboolean is_utf, GList *exceptions) { token_node_t *new = NULL; - f_str_t token = { NULL, 0, 0 }, *res; - uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2; - int i; - - /* First set all bytes of hashpipe to some common value */ - for (i = 0; i < FEATURE_WINDOW_SIZE; i++) { - hashpipe[i] = 0xABCDEF; - } + f_str_t token = { NULL, 0, 0 }; + guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2; + gint i, k = 0, l; + gchar *res; if (*tree == NULL) { *tree = g_tree_new (token_node_compare_func); memory_pool_add_destructor (pool, (pool_destruct_func) g_tree_destroy, *tree); } - while ((res = tokenizer->get_next_word (input, &token)) != NULL) { + while ((res = tokenizer->get_next_word (input, &token, &exceptions)) != NULL) { /* Skip small words */ if (is_utf) { - if (g_utf8_strlen (token.begin, token.len) < MIN_LEN) { - continue; - } + l = g_utf8_strlen (token.begin, token.len); } else { - if (token.len < MIN_LEN) { - continue; - } + l = token.len; } + if (l < MIN_LEN) { + token.begin = res; + continue; + } + /* Shift hashpipe */ for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) { hashpipe[i] = hashpipe[i - 1]; } - hashpipe[0] = fstrhash (&token); + hashpipe[0] = fstrhash_lowercase (&token, is_utf); - for (i = 1; i < FEATURE_WINDOW_SIZE; i++) { - h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; - h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; - new = memory_pool_alloc0 (pool, sizeof (token_node_t)); - new->h1 = h1; - new->h2 = h2; - if (save_token) { - new->extra = (uintptr_t)memory_pool_fstrdup (pool, &token); - } + if (k > FEATURE_WINDOW_SIZE) { + for (i = 1; i < FEATURE_WINDOW_SIZE; i++) { + h1 = 
hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; + h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; + new = memory_pool_alloc0 (pool, sizeof (token_node_t)); + new->h1 = h1; + new->h2 = h2; + if (save_token) { + new->extra = (uintptr_t)memory_pool_fstrdup (pool, &token); + } - if (g_tree_lookup (*tree, new) == NULL) { - g_tree_insert (*tree, new, new); + if (g_tree_lookup (*tree, new) == NULL) { + g_tree_insert (*tree, new, new); + } } } + k ++; + token.begin = res; } return TRUE; diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c index 9e41a9101..be73e506d 100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -52,7 +52,7 @@ const gchar t_delimiters[255] = { 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, - 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -103,44 +103,76 @@ token_node_compare_func (gconstpointer a, gconstpointer b) } /* Get next word from specified f_str_t buf */ -f_str_t * -get_next_word (f_str_t * buf, f_str_t * token) +gchar * +get_next_word (f_str_t * buf, f_str_t * token, GList **exceptions) { - size_t remain; - guchar *pos; + gsize remain, pos; + guchar *p; + struct process_exception *ex = NULL; if (buf == NULL) { return NULL; } + + if (*exceptions != NULL) { + ex = (*exceptions)->data; + } + if (token->begin == NULL) { - token->begin = buf->begin; + if (ex != NULL) { + if (ex->pos == 0) { + token->begin = buf->begin + ex->len; + token->len = ex->len; + } + else { + token->begin = buf->begin; + token->len = 0; + } + } + else { + token->begin = buf->begin; + token->len = 0; + } } - token->begin = token->begin + token->len; token->len = 0; remain = buf->len - (token->begin - buf->begin); if (remain <= 0) { return NULL; } - pos = token->begin; + pos = token->begin - buf->begin; + p = token->begin; /* Skip non delimiters symbols */ - 
while (remain > 0 && t_delimiters[*pos]) { - token->begin++; + do { + if (ex != NULL && ex->pos == pos) { + /* Go to the next exception */ + *exceptions = g_list_next (*exceptions); + return p + ex->len + 1; + } pos++; + p++; remain--; - } - while (remain > 0 && !t_delimiters[*pos]) { + } while (remain > 0 && t_delimiters[*p]); + + token->begin = p; + + while (remain > 0 && !t_delimiters[*p]) { + if (ex != NULL && ex->pos == pos) { + *exceptions = g_list_next (*exceptions); + return p + ex->len + 1; + } token->len++; pos++; remain--; + p ++; } - if (token->len == 0) { + if (remain == 0) { return NULL; } - return token; + return p; } /* Struct to access gmime headers */ @@ -239,13 +271,13 @@ tokenize_subject (struct worker_task *task, GTree ** tree) new = memory_pool_alloc (task->task_pool, sizeof (token_node_t)); subject.begin = task->subject; subject.len = strlen (task->subject); - osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE); + osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE, NULL); } if ((sub = g_mime_message_get_subject (task->message)) != NULL) { new = memory_pool_alloc (task->task_pool, sizeof (token_node_t)); subject.begin = (gchar *)sub; subject.len = strlen (sub); - osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE); + osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE, NULL); } } diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h index df5481a1f..c78d90b0e 100644 --- a/src/tokenizers/tokenizers.h +++ b/src/tokenizers/tokenizers.h @@ -15,17 +15,18 @@ #define FEATURE_WINDOW_SIZE 5 typedef struct token_node_s { - uint32_t h1; - uint32_t h2; + guint32 h1; + guint32 h2; float value; uintptr_t extra; } token_node_t; /* Common tokenizer structure */ struct tokenizer { - char *name; - int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree 
**cur, gboolean save_token, gboolean is_utf); - f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token); + gchar *name; + gint (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, + GTree **cur, gboolean save_token, gboolean is_utf, GList *exceptions); + gchar* (*get_next_word)(f_str_t *buf, f_str_t *token, GList **exceptions); }; /* Compare two token nodes */ @@ -33,9 +34,10 @@ int token_node_compare_func (gconstpointer a, gconstpointer b); /* Get tokenizer structure by name or return NULL if this name is not found */ struct tokenizer* get_tokenizer (char *name); /* Get next word from specified f_str_t buf */ -f_str_t *get_next_word (f_str_t *buf, f_str_t *token); +gchar* get_next_word (f_str_t *buf, f_str_t *token, GList **exceptions); /* OSB tokenize function */ -int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf); +int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, + GTree **cur, gboolean save_token, gboolean is_utf, GList *exceptions); /* Common tokenizer for headers */ int tokenize_headers (memory_pool_t *pool, struct worker_task *task, GTree **cur); /* Make tokens for a subject */ |