diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2016-01-05 16:59:02 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2016-01-05 16:59:02 +0000 |
commit | a142fd150c47668215f8cf9f75374b8e8434b7d9 (patch) | |
tree | 5cfa570a615e1f37ab57cee8f2b953094a7ad850 /src/libstat/tokenizers | |
parent | 46c0c532f5bcc555cd106a61a5e659706290ac78 (diff) | |
download | rspamd-a142fd150c47668215f8cf9f75374b8e8434b7d9.tar.gz rspamd-a142fd150c47668215f8cf9f75374b8e8434b7d9.zip |
Fix tokenization
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- | src/libstat/tokenizers/osb.c | 173 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 35 |
2 files changed, 89 insertions, 119 deletions
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index 2d1b3bb3e..55a0c6bba 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -189,6 +189,7 @@ rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
 	return osb_cf;
 }
 
+#if 0
 gboolean
 rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt,
 		gpointer ptr, gsize len)
@@ -223,28 +224,68 @@ rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt,
 	return ret;
 }
 
+gboolean
+rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool,
+		struct rspamd_tokenizer_runtime *rt,
+		gpointer ptr, gsize len)
+{
+	struct rspamd_osb_tokenizer_config *osb_cf;
+
+	if (ptr == NULL || len == 0) {
+		osb_cf = rspamd_tokenizer_osb_config_from_ucl (pool, rt->tkcf->opts);
+
+		if (osb_cf->ht != RSPAMD_OSB_HASH_COMPAT) {
+			/* Trying to load incompatible configuration */
+			msg_err_pool ("cannot load tokenizer configuration from a legacy "
+					"statfile; maybe you have forgotten to set 'compat' option"
+					" in the tokenizer configuration");
+
+			return FALSE;
+		}
+	}
+	else {
+		g_assert (len == sizeof (*osb_cf));
+		osb_cf = ptr;
+	}
+
+	rt->config = osb_cf;
+	rt->conf_len = sizeof (*osb_cf);
+
+	return TRUE;
+}
+
+gboolean
+rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt)
+{
+	struct rspamd_osb_tokenizer_config *osb_cf = rt->config;
+
+	return (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT);
+}
+#endif
+
+
+
 gint
-rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
-	rspamd_mempool_t * pool,
-	GArray * input,
-	gboolean is_utf,
-	const gchar *prefix)
+rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
+		rspamd_mempool_t *pool,
+		GArray *words,
+		gboolean is_utf,
+		const gchar *prefix,
+		GPtrArray *result)
 {
-	rspamd_token_t *new = NULL;
+	rspamd_token_t *new_tok = NULL;
 	rspamd_ftok_t *token;
 	struct rspamd_osb_tokenizer_config *osb_cf;
 	guint64 *hashpipe, cur, seed;
 	guint32 h1, h2;
+	gsize token_size;
 	guint processed = 0, i, w, window_size;
-	GTree *tree = rt->tokens;
-
-	g_assert (tree != NULL);
 
-	if (input == NULL) {
+	if (words == NULL) {
 		return FALSE;
 	}
 
-	osb_cf = rt->config;
+	osb_cf = ctx->tkcf;
 	window_size = osb_cf->window_size;
 
 	if (prefix) {
@@ -256,9 +297,11 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
 
 	hashpipe = g_alloca (window_size * sizeof (hashpipe[0]));
 	memset (hashpipe, 0xfe, window_size * sizeof (hashpipe[0]));
+	token_size = sizeof (rspamd_token_t) + sizeof (gdouble) * ctx->statfiles->len;
+	g_assert (token_size > 0);
 
-	for (w = 0; w < input->len; w ++) {
-		token = &g_array_index (input, rspamd_ftok_t, w);
+	for (w = 0; w < words->len; w ++) {
+		token = &g_array_index (words, rspamd_ftok_t, w);
 
 		if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
 			cur = rspamd_fstrhash_lc (token, is_utf);
@@ -278,6 +321,25 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
 			}
 		}
 
+#define ADD_TOKEN do {\
+	new_tok = rspamd_mempool_alloc0 (pool, token_size); \
+	new_tok->datalen = sizeof (gint64); \
+	if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { \
+		h1 = ((guint32)hashpipe[0]) * primes[0] + \
+			((guint32)hashpipe[i]) * primes[i << 1]; \
+		h2 = ((guint32)hashpipe[0]) * primes[1] + \
+			((guint32)hashpipe[i]) * primes[(i << 1) - 1]; \
+		memcpy(new_tok->data, &h1, sizeof (h1)); \
+		memcpy(new_tok->data + sizeof (h1), &h2, sizeof (h2)); \
+	} \
+	else { \
+		cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; \
+		memcpy (new_tok->data, &cur, sizeof (cur)); \
+	} \
+	new_tok->window_idx = i + 1; \
+	g_ptr_array_add (result, new_tok); \
+	} while(0)
+
 		if (processed < window_size) {
 			/* Just fill a hashpipe */
 			hashpipe[window_size - ++processed] = cur;
@@ -291,97 +353,20 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
 			processed++;
 
 			for (i = 1; i < window_size; i++) {
-				new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
-				new->datalen = sizeof (gint64);
-
-				if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
-					h1 = ((guint32)hashpipe[0]) * primes[0] +
-						((guint32)hashpipe[i]) * primes[i << 1];
-					h2 = ((guint32)hashpipe[0]) * primes[1] +
-						((guint32)hashpipe[i]) * primes[(i << 1) - 1];
-
-					memcpy(new->data, &h1, sizeof (h1));
-					memcpy(new->data + sizeof (h1), &h2, sizeof (h2));
-				}
-				else {
-					cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
-					memcpy (new->data, &cur, sizeof (cur));
-				}
-
-				new->window_idx = i + 1;
-
-				if (g_tree_lookup (tree, new) == NULL) {
-					g_tree_insert (tree, new, new);
-				}
+				ADD_TOKEN;
 			}
 		}
 	}
 
 	if (processed <= window_size) {
 		memmove (hashpipe, hashpipe + (window_size - processed + 1), processed);
-		for (i = 1; i < processed; i++) {
-			new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
-			new->datalen = sizeof (gint64);
-
-			if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
-				h1 = ((guint32)hashpipe[0]) * primes[0] +
-					((guint32)hashpipe[i]) * primes[i << 1];
-				h2 = ((guint32)hashpipe[0]) * primes[1] +
-					((guint32)hashpipe[i]) * primes[(i << 1) - 1];
-				memcpy(new->data, &h1, sizeof (h1));
-				memcpy(new->data + sizeof (h1), &h2, sizeof (h2));
-			}
-			else {
-				cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
-				memcpy (new->data, &cur, sizeof (cur));
-			}
-
-			new->window_idx = i + 1;
-
-			if (g_tree_lookup (tree, new) == NULL) {
-				g_tree_insert (tree, new, new);
-			}
-		}
-	}
-
-	return TRUE;
-}
-
-
-gboolean
-rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool,
-		struct rspamd_tokenizer_runtime *rt,
-		gpointer ptr, gsize len)
-{
-	struct rspamd_osb_tokenizer_config *osb_cf;
 
-	if (ptr == NULL || len == 0) {
-		osb_cf = rspamd_tokenizer_osb_config_from_ucl (pool, rt->tkcf->opts);
-
-		if (osb_cf->ht != RSPAMD_OSB_HASH_COMPAT) {
-			/* Trying to load incompatible configuration */
-			msg_err_pool ("cannot load tokenizer configuration from a legacy "
-					"statfile; maybe you have forgotten to set 'compat' option"
-					" in the tokenizer configuration");
-
-			return FALSE;
+		for (i = 1; i < processed; i++) {
+			ADD_TOKEN;
 		}
 	}
-	else {
-		g_assert (len == sizeof (*osb_cf));
-		osb_cf = ptr;
-	}
 
-	rt->config = osb_cf;
-	rt->conf_len = sizeof (*osb_cf);
+#undef ADD_TOKEN
 
 	return TRUE;
 }
-
-gboolean
-rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt)
-{
-	struct rspamd_osb_tokenizer_config *osb_cf = rt->config;
-
-	return (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT);
-}
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index f4c9a5ed3..70ff7560c 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -10,23 +10,19 @@
 #define RSPAMD_DEFAULT_TOKENIZER "osb"
 
 struct rspamd_tokenizer_runtime;
+struct rspamd_stat_ctx;
 
 /* Common tokenizer structure */
 struct rspamd_stat_tokenizer {
 	gchar *name;
 	gpointer (*get_config) (rspamd_mempool_t *pool,
 			struct rspamd_tokenizer_config *cf, gsize *len);
-	gboolean (*compatible_config) (struct rspamd_tokenizer_runtime *rt,
-			gpointer ptr, gsize len);
-	gboolean (*load_config) (rspamd_mempool_t *pool,
-			struct rspamd_tokenizer_runtime *rt,
-			gpointer ptr, gsize len);
-	gboolean (*is_compat) (struct rspamd_tokenizer_runtime *rt);
-	gint (*tokenize_func)(struct rspamd_tokenizer_runtime *rt,
+	gint (*tokenize_func)(struct rspamd_stat_ctx *ctx,
 			rspamd_mempool_t *pool,
 			GArray *words,
 			gboolean is_utf,
-			const gchar *prefix);
+			const gchar *prefix,
+			GPtrArray *result);
 };
 
 /* Compare two token nodes */
@@ -39,28 +35,17 @@ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
 		guint64 *hash);
 
 /* OSB tokenize function */
-gint rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
-	rspamd_mempool_t *pool,
-	GArray *input,
-	gboolean is_utf,
-	const gchar *prefix);
+gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
+	rspamd_mempool_t *pool,
+	GArray *words,
+	gboolean is_utf,
+	const gchar *prefix,
+	GPtrArray *result);
 
 gpointer
 rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
 		struct rspamd_tokenizer_config *cf, gsize *len);
 
-gboolean
-rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt,
-		gpointer ptr, gsize len);
-
-gboolean
-rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool,
-		struct rspamd_tokenizer_runtime *rt,
-		gpointer ptr, gsize len);
-
-gboolean
-rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt);
-
 #endif
 /*
  * vi:ts=4 |