diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2013-07-01 17:47:00 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2013-07-01 17:47:00 +0100 |
commit | 7f6ebe1b2903cb1c6b4a0425d0c9925ecc53a2e2 (patch) | |
tree | 6750db16632498552acd80a7d82fe68e556e2d33 /src | |
parent | a51086aa8de0c0a53f94cfb24f0e23a7bdc944ac (diff) | |
download | rspamd-7f6ebe1b2903cb1c6b4a0425d0c9925ecc53a2e2.tar.gz rspamd-7f6ebe1b2903cb1c6b4a0425d0c9925ecc53a2e2.zip |
Reduce tokenization noise to improve Bayes.
Diffstat (limited to 'src')
-rw-r--r-- | src/tokenizers/osb.c | 39 |
1 files changed, 32 insertions, 7 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c index da3ba1270..c59536b31 100644 --- a/src/tokenizers/osb.c +++ b/src/tokenizers/osb.c @@ -41,7 +41,7 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * token_node_t *new = NULL; f_str_t token = { NULL, 0, 0 }; guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2; - gint i, l; + gint i, l, processed = 0; gchar *res; if (*tree == NULL) { @@ -64,13 +64,39 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * continue; } - /* Shift hashpipe */ - for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) { - hashpipe[i] = hashpipe[i - 1]; + if (processed < FEATURE_WINDOW_SIZE) { + /* Just fill a hashpipe */ + hashpipe[FEATURE_WINDOW_SIZE - ++processed] = + fstrhash_lowercase (&token, is_utf); } - hashpipe[0] = fstrhash_lowercase (&token, is_utf); + else { + /* Shift hashpipe */ + for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) { + hashpipe[i] = hashpipe[i - 1]; + } + hashpipe[0] = fstrhash_lowercase (&token, is_utf); + processed ++; + + for (i = 1; i < FEATURE_WINDOW_SIZE; i++) { + h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; + h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; + new = memory_pool_alloc0 (pool, sizeof (token_node_t)); + new->h1 = h1; + new->h2 = h2; + if (save_token) { + new->extra = (uintptr_t)memory_pool_fstrdup (pool, &token); + } - for (i = 1; i < FEATURE_WINDOW_SIZE; i++) { + if (g_tree_lookup (*tree, new) == NULL) { + g_tree_insert (*tree, new, new); + } + } + } + token.begin = res; + } + + if (processed <= FEATURE_WINDOW_SIZE) { + for (i = 1; i < processed; i++) { h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; new = memory_pool_alloc0 (pool, sizeof (token_node_t)); @@ -84,7 +110,6 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * g_tree_insert (*tree, new, new); } } - token.begin = res; } return TRUE; |