author     Vsevolod Stakhov <vsevolod@rambler-co.ru>    2011-06-28 19:07:26 +0400
committer  Vsevolod Stakhov <vsevolod@rambler-co.ru>    2011-06-28 19:07:26 +0400
commit     b3c36d4946f675619b81c9223f5ac1a86c55c55c (patch)
tree       6cdd79cae18ce387f6c00f8ce23aef65b4a5c02b /src/tokenizers
parent     0e6a4235b1794a61d12fcde33cffaf8dd83c51f0 (diff)
download   rspamd-b3c36d4946f675619b81c9223f5ac1a86c55c55c.tar.gz
           rspamd-b3c36d4946f675619b81c9223f5ac1a86c55c55c.zip
* Add a correcting factor to statistics.
Learning now increments the version of a statfile.
Avoid learning and classifying similar text parts when a message has two text parts.
Several fixes to statistics.
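
The "similar text parts" point can be pictured as follows. This is a rough, self-contained sketch of the idea only, not rspamd's implementation: the struct, the prefix-based similarity measure and should_process_part() are hypothetical stand-ins for whatever comparison the statistics code actually performs when a message carries exactly two text parts (typically the text/plain and text/html renderings of the same content).

/*
 * Illustrative only: skip classifying/learning the second of two text
 * parts when it is nearly identical to the first.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct text_part {
	const char *content;
	size_t len;
};

/* Very crude similarity: shared-prefix ratio.  A real implementation
 * would use a fuzzy/shingle hash rather than a prefix comparison. */
static double
part_similarity (const struct text_part *a, const struct text_part *b)
{
	size_t min_len = a->len < b->len ? a->len : b->len;
	size_t max_len = a->len > b->len ? a->len : b->len;
	size_t common = 0;

	if (max_len == 0) {
		return 1.0;
	}
	while (common < min_len && a->content[common] == b->content[common]) {
		common ++;
	}
	return (double) common / (double) max_len;
}

/* Process the second of exactly two parts only if it differs enough
 * from the first one. */
static bool
should_process_part (const struct text_part *parts, size_t nparts, size_t idx)
{
	if (nparts == 2 && idx == 1) {
		return part_similarity (&parts[0], &parts[1]) < 0.9;
	}
	return true;
}

int
main (void)
{
	const char *plain = "Hello world, this is the plain part";
	const char *html = "Hello world, this is the plain part";
	struct text_part parts[2] = {
		{ plain, strlen (plain) },
		{ html, strlen (html) }
	};

	printf ("process part 1? %s\n",
		should_process_part (parts, 2, 1) ? "yes" : "no (near-duplicate)");
	return 0;
}
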
Diffstat (limited to 'src/tokenizers')
-rw-r--r--  src/tokenizers/osb.c        | 29
-rw-r--r--  src/tokenizers/tokenizers.c |  2
2 files changed, 15 insertions, 16 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index bc57255cb..790069d6a 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -41,7 +41,7 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t *
 	token_node_t *new = NULL;
 	f_str_t token = { NULL, 0, 0 };
 	guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
-	gint i, k = 0, l;
+	gint i, l;
 	gchar *res;
 
 	if (*tree == NULL) {
@@ -49,6 +49,8 @@
 		memory_pool_add_destructor (pool, (pool_destruct_func) g_tree_destroy, *tree);
 	}
 
+	memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0]));
+
 	while ((res = tokenizer->get_next_word (input, &token, &exceptions)) != NULL) {
 		/* Skip small words */
 		if (is_utf) {
@@ -68,23 +70,20 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t *
 		}
 
 		hashpipe[0] = fstrhash_lowercase (&token, is_utf);
 
-		if (k > FEATURE_WINDOW_SIZE) {
-			for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
-				h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
-				h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
-				new = memory_pool_alloc0 (pool, sizeof (token_node_t));
-				new->h1 = h1;
-				new->h2 = h2;
-				if (save_token) {
-					new->extra = (uintptr_t)memory_pool_fstrdup (pool, &token);
-				}
+		for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
+			h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
+			h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
+			new = memory_pool_alloc0 (pool, sizeof (token_node_t));
+			new->h1 = h1;
+			new->h2 = h2;
+			if (save_token) {
+				new->extra = (uintptr_t)memory_pool_fstrdup (pool, &token);
+			}
 
-				if (g_tree_lookup (*tree, new) == NULL) {
-					g_tree_insert (*tree, new, new);
-				}
+			if (g_tree_lookup (*tree, new) == NULL) {
+				g_tree_insert (*tree, new, new);
 			}
 		}
-		k ++;
 		token.begin = res;
 	}
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index 16dc763ed..d5a820d1b 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -138,7 +138,7 @@ get_next_word (f_str_t * buf, f_str_t * token, GList **exceptions)
 	token->len = 0;
 
 	remain = buf->len - (token->begin - buf->begin);
-	if (remain <= 0) {
+	if (remain == 0) {
 		return NULL;
 	}
 	pos = token->begin - buf->begin;
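
For context, the loop this patch rewrites implements OSB (orthogonal sparse bigram) feature extraction: the hash of the newest word is paired with the hashes of a few preceding words, each pair mixed with a pair of prime multipliers to yield two feature hashes. Pre-filling the hash pipe with a fixed 0xfe pattern makes pairs formed against still-empty slots stable from the very first word, which is why the old k warm-up counter can be dropped. The following standalone sketch mimics that scheme; demo_hash(), demo_primes[] and the word list are illustrative stand-ins, not rspamd's fstrhash_lowercase() or primes[] table.

/*
 * Minimal, self-contained sketch of OSB-style hashing as adjusted by the
 * patch above.  Illustrative only; not the rspamd tokenizer API.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <inttypes.h>

#define FEATURE_WINDOW_SIZE 5

/* A few odd multipliers standing in for rspamd's primes[] table. */
static const uint32_t demo_primes[] = {
	11, 13, 17, 19, 23, 29, 31, 37, 41, 43
};

/* Toy FNV-1a style hash of a lowercased word (stand-in for a real
 * lowercase string hash). */
static uint32_t
demo_hash (const char *word)
{
	uint32_t h = 2166136261u;

	for (; *word != '\0'; word ++) {
		h ^= (uint32_t) (*word | 0x20);   /* crude ASCII lowercasing */
		h *= 16777619u;
	}
	return h;
}

int
main (void)
{
	const char *words[] = { "add", "correcting", "factor", "to", "statistics" };
	uint32_t hashpipe[FEATURE_WINDOW_SIZE];
	uint32_t h1, h2;
	int i, w;

	/* As in the patch: pre-fill the pipe with a fixed pattern so that
	 * pairs built against "empty" slots are stable from the first word,
	 * making a warm-up counter unnecessary. */
	memset (hashpipe, 0xfe, sizeof (hashpipe));

	for (w = 0; w < (int) (sizeof (words) / sizeof (words[0])); w ++) {
		/* Shift the window and push the hash of the current word. */
		for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) {
			hashpipe[i] = hashpipe[i - 1];
		}
		hashpipe[0] = demo_hash (words[w]);

		/* Pair the newest word with each older slot, as osb.c does. */
		for (i = 1; i < FEATURE_WINDOW_SIZE; i ++) {
			h1 = hashpipe[0] * demo_primes[0] + hashpipe[i] * demo_primes[i << 1];
			h2 = hashpipe[0] * demo_primes[1] + hashpipe[i] * demo_primes[(i << 1) - 1];
			printf ("word=%-11s dist=%d h1=%08" PRIx32 " h2=%08" PRIx32 "\n",
				words[w], i, h1, h2);
		}
	}
	return 0;
}
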