summaryrefslogtreecommitdiffstats
path: root/src/tokenizers
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2009-01-21 17:25:06 +0300
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2009-01-21 17:25:06 +0300
commit1dc0f6ad2c2e97e11881a7e1b0a4142e65f50898 (patch)
treef0a714e2e87ebd50f6016c8cc7f2a8e03a9cc2d8 /src/tokenizers
parent87c9659fdd08bbbc0eb796afccf7237a03181498 (diff)
downloadrspamd-1dc0f6ad2c2e97e11881a7e1b0a4142e65f50898.tar.gz
rspamd-1dc0f6ad2c2e97e11881a7e1b0a4142e65f50898.zip
* Rewrite message parser
* Change MIME parts storage * Add HTML tags stripping (ported from PHP code) * Rework learning to process only text and stripped HTML parts
Diffstat (limited to 'src/tokenizers')
-rw-r--r--src/tokenizers/osb.c21
-rw-r--r--src/tokenizers/tokenizers.c6
-rw-r--r--src/tokenizers/tokenizers.h4
3 files changed, 17 insertions, 14 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index 451644675..122fa2241 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -20,11 +20,10 @@ static const int primes[] = {
797, 3277,
};
-GTree *
-osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input)
+int
+osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **tree)
{
token_node_t *new = NULL;
- GTree *tree;
f_str_t token = { NULL, 0, 0 };
uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
int i;
@@ -33,9 +32,13 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
for (i = 0; i < FEATURE_WINDOW_SIZE; i ++) {
hashpipe[i] = 0xABCDEF;
}
+
+ if (*tree == NULL) {
+ *tree = g_tree_new (token_node_compare_func);
+ memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, *tree);
+ }
- tree = g_tree_new (token_node_compare_func);
- memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, tree);
+ msg_debug ("osb_tokenize_text: got input length: %zd", input->len);
while (tokenizer->get_next_word (input, &token)) {
/* Shift hashpipe */
@@ -43,7 +46,6 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
hashpipe[i] = hashpipe[i - 1];
}
hashpipe[0] = fstrhash (&token);
- msg_debug ("osb_tokenize_text: text token %s, hash: %d", fstrcstr (&token, pool), hashpipe[0]);
for (i = 1; i < FEATURE_WINDOW_SIZE; i ++) {
h1 = hashpipe[0]* primes[0] + hashpipe[i] * primes[i<<1];
@@ -52,14 +54,13 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
new->h1 = h1;
new->h2 = h2;
- if (g_tree_lookup (tree, new) == NULL) {
- msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2);
- g_tree_insert (tree, new, new);
+ if (g_tree_lookup (*tree, new) == NULL) {
+ g_tree_insert (*tree, new, new);
}
}
}
- return tree;
+ return TRUE;
}
/*
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index f0481e00d..6c92f9a97 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -60,13 +60,15 @@ get_next_word (f_str_t *buf, f_str_t *token)
pos = token->begin;
/* Skip non graph symbols */
- while (remain-- && !g_ascii_isgraph (*pos)) {
+ while (remain > 0 && !g_ascii_isgraph (*pos)) {
token->begin ++;
pos ++;
+ remain --;
}
- while (remain-- && g_ascii_isgraph (*pos)) {
+ while (remain > 0 && g_ascii_isgraph (*pos)) {
token->len ++;
pos ++;
+ remain --;
}
if (token->len == 0) {
diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h
index c3453a945..10c8ae7aa 100644
--- a/src/tokenizers/tokenizers.h
+++ b/src/tokenizers/tokenizers.h
@@ -22,7 +22,7 @@ typedef struct token_node_s {
/* Common tokenizer structure */
struct tokenizer {
char *name;
- GTree* (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
+ int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur);
f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token);
};
@@ -33,7 +33,7 @@ struct tokenizer* get_tokenizer (char *name);
/* Get next word from specified f_str_t buf */
f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
/* OSB tokenize function */
-GTree* osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
+int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur);
/* Array of all defined tokenizers */
extern struct tokenizer tokenizers[];