summary | refs | log | tree | commit | diff | stats
path: root/src/tokenizers/osb.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/tokenizers/osb.c')
-rw-r--r-- src/tokenizers/osb.c 40
1 files changed, 25 insertions, 15 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index 823e1e5b5..faa6a9669 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -26,32 +26,40 @@
* OSB tokenizer
*/
-#include <sys/types.h>
#include "tokenizers.h"
+#include <sys/types.h>
/* Minimum length of token */
#define MIN_LEN 4
-extern const int primes[];
+extern const int primes[];
int
-osb_tokenize_text (struct tokenizer *tokenizer, rspamd_mempool_t * pool, f_str_t * input, GTree ** tree,
- gboolean save_token, gboolean is_utf, GList *exceptions)
+osb_tokenize_text (struct tokenizer *tokenizer,
+ rspamd_mempool_t * pool,
+ f_str_t * input,
+ GTree ** tree,
+ gboolean save_token,
+ gboolean is_utf,
+ GList *exceptions)
{
- token_node_t *new = NULL;
- f_str_t token = { NULL, 0, 0 };
- guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
- gint i, l, processed = 0;
- gchar *res;
+ token_node_t *new = NULL;
+ f_str_t token = { NULL, 0, 0 };
+ guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
+ gint i, l, processed = 0;
+ gchar *res;
if (*tree == NULL) {
*tree = g_tree_new (token_node_compare_func);
- rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t) g_tree_destroy, *tree);
+ rspamd_mempool_add_destructor (pool,
+ (rspamd_mempool_destruct_t) g_tree_destroy,
+ *tree);
}
memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0]));
- while ((res = tokenizer->get_next_word (input, &token, &exceptions)) != NULL) {
+ while ((res =
+ tokenizer->get_next_word (input, &token, &exceptions)) != NULL) {
/* Skip small words */
if (is_utf) {
l = g_utf8_strlen (token.begin, token.len);
@@ -67,7 +75,7 @@ osb_tokenize_text (struct tokenizer *tokenizer, rspamd_mempool_t * pool, f_str_t
if (processed < FEATURE_WINDOW_SIZE) {
/* Just fill a hashpipe */
hashpipe[FEATURE_WINDOW_SIZE - ++processed] =
- fstrhash_lowercase (&token, is_utf);
+ fstrhash_lowercase (&token, is_utf);
}
else {
/* Shift hashpipe */
@@ -75,16 +83,18 @@ osb_tokenize_text (struct tokenizer *tokenizer, rspamd_mempool_t * pool, f_str_t
hashpipe[i] = hashpipe[i - 1];
}
hashpipe[0] = fstrhash_lowercase (&token, is_utf);
- processed ++;
+ processed++;
for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
- h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
+ h2 = hashpipe[0] * primes[1] + hashpipe[i] *
+ primes[(i << 1) - 1];
new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t));
new->h1 = h1;
new->h2 = h2;
if (save_token) {
- new->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, &token);
+ new->extra =
+ (uintptr_t)rspamd_mempool_fstrdup (pool, &token);
}
if (g_tree_lookup (*tree, new) == NULL) {