aboutsummaryrefslogtreecommitdiffstats
path: root/src/tokenizers
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2011-06-24 20:25:54 +0400
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2011-06-24 20:25:54 +0400
commita3fa4d672341fd2f1888d3a2f2ed85ae57913b78 (patch)
tree352c634bbbc74cf17644545ace66a8feedc841c3 /src/tokenizers
parent63725086863e4f422340479f83dd7ef374613e76 (diff)
downloadrspamd-a3fa4d672341fd2f1888d3a2f2ed85ae57913b78.tar.gz
rspamd-a3fa4d672341fd2f1888d3a2f2ed85ae57913b78.zip
* Welcome 0.4.0
Incompatible changes: - Statistics is incompatible in utf8 mode Major changes: - Improved utf8 mode - Convert all characters to lowercase in statistics - Skip URLs in statistics - Improve speed of bayes classifier by using integer arithmetic - Fixed statfiles synchronization that was broken for a long time - Synchronization is now configurable Minor changes: - Bugfixes - Removed some of legacy code - Types polishing
Diffstat (limited to 'src/tokenizers')
-rw-r--r--src/tokenizers/osb.c57
-rw-r--r--src/tokenizers/tokenizers.c64
-rw-r--r--src/tokenizers/tokenizers.h16
3 files changed, 86 insertions, 51 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index 5f5dfcdcd..bc57255cb 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -36,55 +36,56 @@ extern const int primes[];
int
osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * input, GTree ** tree,
- gboolean save_token, gboolean is_utf)
+ gboolean save_token, gboolean is_utf, GList *exceptions)
{
token_node_t *new = NULL;
- f_str_t token = { NULL, 0, 0 }, *res;
- uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
- int i;
-
- /* First set all bytes of hashpipe to some common value */
- for (i = 0; i < FEATURE_WINDOW_SIZE; i++) {
- hashpipe[i] = 0xABCDEF;
- }
+ f_str_t token = { NULL, 0, 0 };
+ guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
+ gint i, k = 0, l;
+ gchar *res;
if (*tree == NULL) {
*tree = g_tree_new (token_node_compare_func);
memory_pool_add_destructor (pool, (pool_destruct_func) g_tree_destroy, *tree);
}
- while ((res = tokenizer->get_next_word (input, &token)) != NULL) {
+ while ((res = tokenizer->get_next_word (input, &token, &exceptions)) != NULL) {
/* Skip small words */
if (is_utf) {
- if (g_utf8_strlen (token.begin, token.len) < MIN_LEN) {
- continue;
- }
+ l = g_utf8_strlen (token.begin, token.len);
}
else {
- if (token.len < MIN_LEN) {
- continue;
- }
+ l = token.len;
}
+ if (l < MIN_LEN) {
+ token.begin = res;
+ continue;
+ }
+
/* Shift hashpipe */
for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
hashpipe[i] = hashpipe[i - 1];
}
- hashpipe[0] = fstrhash (&token);
+ hashpipe[0] = fstrhash_lowercase (&token, is_utf);
- for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
- h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
- h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
- new = memory_pool_alloc0 (pool, sizeof (token_node_t));
- new->h1 = h1;
- new->h2 = h2;
- if (save_token) {
- new->extra = (uintptr_t)memory_pool_fstrdup (pool, &token);
- }
+ if (k > FEATURE_WINDOW_SIZE) {
+ for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
+ h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
+ h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
+ new = memory_pool_alloc0 (pool, sizeof (token_node_t));
+ new->h1 = h1;
+ new->h2 = h2;
+ if (save_token) {
+ new->extra = (uintptr_t)memory_pool_fstrdup (pool, &token);
+ }
- if (g_tree_lookup (*tree, new) == NULL) {
- g_tree_insert (*tree, new, new);
+ if (g_tree_lookup (*tree, new) == NULL) {
+ g_tree_insert (*tree, new, new);
+ }
}
}
+ k ++;
+ token.begin = res;
}
return TRUE;
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index 9e41a9101..be73e506d 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -52,7 +52,7 @@ const gchar t_delimiters[255] = {
1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
- 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -103,44 +103,76 @@ token_node_compare_func (gconstpointer a, gconstpointer b)
}
/* Get next word from specified f_str_t buf */
-f_str_t *
-get_next_word (f_str_t * buf, f_str_t * token)
+gchar *
+get_next_word (f_str_t * buf, f_str_t * token, GList **exceptions)
{
- size_t remain;
- guchar *pos;
+ gsize remain, pos;
+ guchar *p;
+ struct process_exception *ex = NULL;
if (buf == NULL) {
return NULL;
}
+
+ if (*exceptions != NULL) {
+ ex = (*exceptions)->data;
+ }
+
if (token->begin == NULL) {
- token->begin = buf->begin;
+ if (ex != NULL) {
+ if (ex->pos == 0) {
+ token->begin = buf->begin + ex->len;
+ token->len = ex->len;
+ }
+ else {
+ token->begin = buf->begin;
+ token->len = 0;
+ }
+ }
+ else {
+ token->begin = buf->begin;
+ token->len = 0;
+ }
}
- token->begin = token->begin + token->len;
token->len = 0;
remain = buf->len - (token->begin - buf->begin);
if (remain <= 0) {
return NULL;
}
- pos = token->begin;
+ pos = token->begin - buf->begin;
+ p = token->begin;
/* Skip non delimiters symbols */
- while (remain > 0 && t_delimiters[*pos]) {
- token->begin++;
+ do {
+ if (ex != NULL && ex->pos == pos) {
+ /* Go to the next exception */
+ *exceptions = g_list_next (*exceptions);
+ return p + ex->len + 1;
+ }
pos++;
+ p++;
remain--;
- }
- while (remain > 0 && !t_delimiters[*pos]) {
+ } while (remain > 0 && t_delimiters[*p]);
+
+ token->begin = p;
+
+ while (remain > 0 && !t_delimiters[*p]) {
+ if (ex != NULL && ex->pos == pos) {
+ *exceptions = g_list_next (*exceptions);
+ return p + ex->len + 1;
+ }
token->len++;
pos++;
remain--;
+ p ++;
}
- if (token->len == 0) {
+ if (remain == 0) {
return NULL;
}
- return token;
+ return p;
}
/* Struct to access gmime headers */
@@ -239,13 +271,13 @@ tokenize_subject (struct worker_task *task, GTree ** tree)
new = memory_pool_alloc (task->task_pool, sizeof (token_node_t));
subject.begin = task->subject;
subject.len = strlen (task->subject);
- osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE);
+ osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE, NULL);
}
if ((sub = g_mime_message_get_subject (task->message)) != NULL) {
new = memory_pool_alloc (task->task_pool, sizeof (token_node_t));
subject.begin = (gchar *)sub;
subject.len = strlen (sub);
- osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE);
+ osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE, NULL);
}
}
diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h
index df5481a1f..c78d90b0e 100644
--- a/src/tokenizers/tokenizers.h
+++ b/src/tokenizers/tokenizers.h
@@ -15,17 +15,18 @@
#define FEATURE_WINDOW_SIZE 5
typedef struct token_node_s {
- uint32_t h1;
- uint32_t h2;
+ guint32 h1;
+ guint32 h2;
float value;
uintptr_t extra;
} token_node_t;
/* Common tokenizer structure */
struct tokenizer {
- char *name;
- int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf);
- f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token);
+ gchar *name;
+ gint (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input,
+ GTree **cur, gboolean save_token, gboolean is_utf, GList *exceptions);
+ gchar* (*get_next_word)(f_str_t *buf, f_str_t *token, GList **exceptions);
};
/* Compare two token nodes */
@@ -33,9 +34,10 @@ int token_node_compare_func (gconstpointer a, gconstpointer b);
/* Get tokenizer structure by name or return NULL if this name is not found */
struct tokenizer* get_tokenizer (char *name);
/* Get next word from specified f_str_t buf */
-f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
+gchar* get_next_word (f_str_t *buf, f_str_t *token, GList **exceptions);
/* OSB tokenize function */
-int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf);
+int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input,
+ GTree **cur, gboolean save_token, gboolean is_utf, GList *exceptions);
/* Common tokenizer for headers */
int tokenize_headers (memory_pool_t *pool, struct worker_task *task, GTree **cur);
/* Make tokens for a subject */