diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-01-21 17:25:06 +0300 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-01-21 17:25:06 +0300 |
commit | 1dc0f6ad2c2e97e11881a7e1b0a4142e65f50898 (patch) | |
tree | f0a714e2e87ebd50f6016c8cc7f2a8e03a9cc2d8 /src/tokenizers | |
parent | 87c9659fdd08bbbc0eb796afccf7237a03181498 (diff) | |
download | rspamd-1dc0f6ad2c2e97e11881a7e1b0a4142e65f50898.tar.gz rspamd-1dc0f6ad2c2e97e11881a7e1b0a4142e65f50898.zip |
* Rewrite message parser
* Change mime parts storage
* Add HTML tag stripping (ported from PHP code)
* Rework learning to process only text and stripped HTML parts
Diffstat (limited to 'src/tokenizers')
-rw-r--r-- | src/tokenizers/osb.c | 21 | ||||
-rw-r--r-- | src/tokenizers/tokenizers.c | 6 | ||||
-rw-r--r-- | src/tokenizers/tokenizers.h | 4 |
3 files changed, 17 insertions, 14 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c index 451644675..122fa2241 100644 --- a/src/tokenizers/osb.c +++ b/src/tokenizers/osb.c @@ -20,11 +20,10 @@ static const int primes[] = { 797, 3277, }; -GTree * -osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input) +int +osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **tree) { token_node_t *new = NULL; - GTree *tree; f_str_t token = { NULL, 0, 0 }; uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2; int i; @@ -33,9 +32,13 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in for (i = 0; i < FEATURE_WINDOW_SIZE; i ++) { hashpipe[i] = 0xABCDEF; } + + if (*tree == NULL) { + *tree = g_tree_new (token_node_compare_func); + memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, *tree); + } - tree = g_tree_new (token_node_compare_func); - memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, tree); + msg_debug ("osb_tokenize_text: got input length: %zd", input->len); while (tokenizer->get_next_word (input, &token)) { /* Shift hashpipe */ @@ -43,7 +46,6 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in hashpipe[i] = hashpipe[i - 1]; } hashpipe[0] = fstrhash (&token); - msg_debug ("osb_tokenize_text: text token %s, hash: %d", fstrcstr (&token, pool), hashpipe[0]); for (i = 1; i < FEATURE_WINDOW_SIZE; i ++) { h1 = hashpipe[0]* primes[0] + hashpipe[i] * primes[i<<1]; @@ -52,14 +54,13 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in new->h1 = h1; new->h2 = h2; - if (g_tree_lookup (tree, new) == NULL) { - msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2); - g_tree_insert (tree, new, new); + if (g_tree_lookup (*tree, new) == NULL) { + g_tree_insert (*tree, new, new); } } } - return tree; + return TRUE; } /* diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c index f0481e00d..6c92f9a97 
100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -60,13 +60,15 @@ get_next_word (f_str_t *buf, f_str_t *token) pos = token->begin; /* Skip non graph symbols */ - while (remain-- && !g_ascii_isgraph (*pos)) { + while (remain > 0 && !g_ascii_isgraph (*pos)) { token->begin ++; pos ++; + remain --; } - while (remain-- && g_ascii_isgraph (*pos)) { + while (remain > 0 && g_ascii_isgraph (*pos)) { token->len ++; pos ++; + remain --; } if (token->len == 0) { diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h index c3453a945..10c8ae7aa 100644 --- a/src/tokenizers/tokenizers.h +++ b/src/tokenizers/tokenizers.h @@ -22,7 +22,7 @@ typedef struct token_node_s { /* Common tokenizer structure */ struct tokenizer { char *name; - GTree* (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input); + int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur); f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token); }; @@ -33,7 +33,7 @@ struct tokenizer* get_tokenizer (char *name); /* Get next word from specified f_str_t buf */ f_str_t *get_next_word (f_str_t *buf, f_str_t *token); /* OSB tokenize function */ -GTree* osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input); +int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur); /* Array of all defined tokenizers */ extern struct tokenizer tokenizers[]; |