diff options
Diffstat (limited to 'src/tokenizers/osb.c')
-rw-r--r-- | src/tokenizers/osb.c | 40 |
1 files changed, 25 insertions, 15 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c index 823e1e5b5..faa6a9669 100644 --- a/src/tokenizers/osb.c +++ b/src/tokenizers/osb.c @@ -26,32 +26,40 @@ * OSB tokenizer */ -#include <sys/types.h> #include "tokenizers.h" +#include <sys/types.h> /* Minimum length of token */ #define MIN_LEN 4 -extern const int primes[]; +extern const int primes[]; int -osb_tokenize_text (struct tokenizer *tokenizer, rspamd_mempool_t * pool, f_str_t * input, GTree ** tree, - gboolean save_token, gboolean is_utf, GList *exceptions) +osb_tokenize_text (struct tokenizer *tokenizer, + rspamd_mempool_t * pool, + f_str_t * input, + GTree ** tree, + gboolean save_token, + gboolean is_utf, + GList *exceptions) { - token_node_t *new = NULL; - f_str_t token = { NULL, 0, 0 }; - guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2; - gint i, l, processed = 0; - gchar *res; + token_node_t *new = NULL; + f_str_t token = { NULL, 0, 0 }; + guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2; + gint i, l, processed = 0; + gchar *res; if (*tree == NULL) { *tree = g_tree_new (token_node_compare_func); - rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t) g_tree_destroy, *tree); + rspamd_mempool_add_destructor (pool, + (rspamd_mempool_destruct_t) g_tree_destroy, + *tree); } memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0])); - while ((res = tokenizer->get_next_word (input, &token, &exceptions)) != NULL) { + while ((res = + tokenizer->get_next_word (input, &token, &exceptions)) != NULL) { /* Skip small words */ if (is_utf) { l = g_utf8_strlen (token.begin, token.len); @@ -67,7 +75,7 @@ osb_tokenize_text (struct tokenizer *tokenizer, rspamd_mempool_t * pool, f_str_t if (processed < FEATURE_WINDOW_SIZE) { /* Just fill a hashpipe */ hashpipe[FEATURE_WINDOW_SIZE - ++processed] = - fstrhash_lowercase (&token, is_utf); + fstrhash_lowercase (&token, is_utf); } else { /* Shift hashpipe */ @@ -75,16 +83,18 @@ osb_tokenize_text (struct tokenizer *tokenizer, rspamd_mempool_t * pool, f_str_t hashpipe[i] = hashpipe[i - 1]; } hashpipe[0] = fstrhash_lowercase (&token, is_utf); - processed ++; + processed++; for (i = 1; i < FEATURE_WINDOW_SIZE; i++) { h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; - h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; + h2 = hashpipe[0] * primes[1] + hashpipe[i] * + primes[(i << 1) - 1]; new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t)); new->h1 = h1; new->h2 = h2; if (save_token) { - new->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, &token); + new->extra = + (uintptr_t)rspamd_mempool_fstrdup (pool, &token); } if (g_tree_lookup (*tree, new) == NULL) { |