about | summary | refs | log | tree | commit | diff | stats
path: root/src/libstat/tokenizers
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-01-05 16:59:02 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-01-05 16:59:02 +0000
commit a142fd150c47668215f8cf9f75374b8e8434b7d9 (patch)
tree 5cfa570a615e1f37ab57cee8f2b953094a7ad850 /src/libstat/tokenizers
parent 46c0c532f5bcc555cd106a61a5e659706290ac78 (diff)
download rspamd-a142fd150c47668215f8cf9f75374b8e8434b7d9.tar.gz
rspamd-a142fd150c47668215f8cf9f75374b8e8434b7d9.zip
Fix tokenization
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- src/libstat/tokenizers/osb.c 173
-rw-r--r-- src/libstat/tokenizers/tokenizers.h 35
2 files changed, 89 insertions, 119 deletions
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index 2d1b3bb3e..55a0c6bba 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -189,6 +189,7 @@ rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
return osb_cf;
}
+#if 0
gboolean
rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt,
gpointer ptr, gsize len)
@@ -223,28 +224,68 @@ rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt,
return ret;
}
+gboolean
+rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool,
+ struct rspamd_tokenizer_runtime *rt,
+ gpointer ptr, gsize len)
+{
+ struct rspamd_osb_tokenizer_config *osb_cf;
+
+ if (ptr == NULL || len == 0) {
+ osb_cf = rspamd_tokenizer_osb_config_from_ucl (pool, rt->tkcf->opts);
+
+ if (osb_cf->ht != RSPAMD_OSB_HASH_COMPAT) {
+ /* Trying to load incompatible configuration */
+ msg_err_pool ("cannot load tokenizer configuration from a legacy "
+ "statfile; maybe you have forgotten to set 'compat' option"
+ " in the tokenizer configuration");
+
+ return FALSE;
+ }
+ }
+ else {
+ g_assert (len == sizeof (*osb_cf));
+ osb_cf = ptr;
+ }
+
+ rt->config = osb_cf;
+ rt->conf_len = sizeof (*osb_cf);
+
+ return TRUE;
+}
+
+gboolean
+rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt)
+{
+ struct rspamd_osb_tokenizer_config *osb_cf = rt->config;
+
+ return (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT);
+}
+#endif
+
+
+
gint
-rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
- rspamd_mempool_t * pool,
- GArray * input,
- gboolean is_utf,
- const gchar *prefix)
+rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
+ rspamd_mempool_t *pool,
+ GArray *words,
+ gboolean is_utf,
+ const gchar *prefix,
+ GPtrArray *result)
{
- rspamd_token_t *new = NULL;
+ rspamd_token_t *new_tok = NULL;
rspamd_ftok_t *token;
struct rspamd_osb_tokenizer_config *osb_cf;
guint64 *hashpipe, cur, seed;
guint32 h1, h2;
+ gsize token_size;
guint processed = 0, i, w, window_size;
- GTree *tree = rt->tokens;
-
- g_assert (tree != NULL);
- if (input == NULL) {
+ if (words == NULL) {
return FALSE;
}
- osb_cf = rt->config;
+ osb_cf = ctx->tkcf;
window_size = osb_cf->window_size;
if (prefix) {
@@ -256,9 +297,11 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
hashpipe = g_alloca (window_size * sizeof (hashpipe[0]));
memset (hashpipe, 0xfe, window_size * sizeof (hashpipe[0]));
+ token_size = sizeof (rspamd_token_t) + sizeof (gdouble) * ctx->statfiles->len;
+ g_assert (token_size > 0);
- for (w = 0; w < input->len; w ++) {
- token = &g_array_index (input, rspamd_ftok_t, w);
+ for (w = 0; w < words->len; w ++) {
+ token = &g_array_index (words, rspamd_ftok_t, w);
if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
cur = rspamd_fstrhash_lc (token, is_utf);
@@ -278,6 +321,25 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
}
}
+#define ADD_TOKEN do {\
+ new_tok = rspamd_mempool_alloc0 (pool, token_size); \
+ new_tok->datalen = sizeof (gint64); \
+ if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { \
+ h1 = ((guint32)hashpipe[0]) * primes[0] + \
+ ((guint32)hashpipe[i]) * primes[i << 1]; \
+ h2 = ((guint32)hashpipe[0]) * primes[1] + \
+ ((guint32)hashpipe[i]) * primes[(i << 1) - 1]; \
+ memcpy(new_tok->data, &h1, sizeof (h1)); \
+ memcpy(new_tok->data + sizeof (h1), &h2, sizeof (h2)); \
+ } \
+ else { \
+ cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; \
+ memcpy (new_tok->data, &cur, sizeof (cur)); \
+ } \
+ new_tok->window_idx = i + 1; \
+ g_ptr_array_add (result, new_tok); \
+ } while(0)
+
if (processed < window_size) {
/* Just fill a hashpipe */
hashpipe[window_size - ++processed] = cur;
@@ -291,97 +353,20 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
processed++;
for (i = 1; i < window_size; i++) {
- new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
- new->datalen = sizeof (gint64);
-
- if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
- h1 = ((guint32)hashpipe[0]) * primes[0] +
- ((guint32)hashpipe[i]) * primes[i << 1];
- h2 = ((guint32)hashpipe[0]) * primes[1] +
- ((guint32)hashpipe[i]) * primes[(i << 1) - 1];
-
- memcpy(new->data, &h1, sizeof (h1));
- memcpy(new->data + sizeof (h1), &h2, sizeof (h2));
- }
- else {
- cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
- memcpy (new->data, &cur, sizeof (cur));
- }
-
- new->window_idx = i + 1;
-
- if (g_tree_lookup (tree, new) == NULL) {
- g_tree_insert (tree, new, new);
- }
+ ADD_TOKEN;
}
}
}
if (processed <= window_size) {
memmove (hashpipe, hashpipe + (window_size - processed + 1), processed);
- for (i = 1; i < processed; i++) {
- new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
- new->datalen = sizeof (gint64);
-
- if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
- h1 = ((guint32)hashpipe[0]) * primes[0] +
- ((guint32)hashpipe[i]) * primes[i << 1];
- h2 = ((guint32)hashpipe[0]) * primes[1] +
- ((guint32)hashpipe[i]) * primes[(i << 1) - 1];
- memcpy(new->data, &h1, sizeof (h1));
- memcpy(new->data + sizeof (h1), &h2, sizeof (h2));
- }
- else {
- cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
- memcpy (new->data, &cur, sizeof (cur));
- }
-
- new->window_idx = i + 1;
-
- if (g_tree_lookup (tree, new) == NULL) {
- g_tree_insert (tree, new, new);
- }
- }
- }
-
- return TRUE;
-}
-
-
-gboolean
-rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool,
- struct rspamd_tokenizer_runtime *rt,
- gpointer ptr, gsize len)
-{
- struct rspamd_osb_tokenizer_config *osb_cf;
- if (ptr == NULL || len == 0) {
- osb_cf = rspamd_tokenizer_osb_config_from_ucl (pool, rt->tkcf->opts);
-
- if (osb_cf->ht != RSPAMD_OSB_HASH_COMPAT) {
- /* Trying to load incompatible configuration */
- msg_err_pool ("cannot load tokenizer configuration from a legacy "
- "statfile; maybe you have forgotten to set 'compat' option"
- " in the tokenizer configuration");
-
- return FALSE;
+ for (i = 1; i < processed; i++) {
+ ADD_TOKEN;
}
}
- else {
- g_assert (len == sizeof (*osb_cf));
- osb_cf = ptr;
- }
- rt->config = osb_cf;
- rt->conf_len = sizeof (*osb_cf);
+#undef ADD_TOKEN
return TRUE;
}
-
-gboolean
-rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt)
-{
- struct rspamd_osb_tokenizer_config *osb_cf = rt->config;
-
- return (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT);
-}
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index f4c9a5ed3..70ff7560c 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -10,23 +10,19 @@
#define RSPAMD_DEFAULT_TOKENIZER "osb"
struct rspamd_tokenizer_runtime;
+struct rspamd_stat_ctx;
/* Common tokenizer structure */
struct rspamd_stat_tokenizer {
gchar *name;
gpointer (*get_config) (rspamd_mempool_t *pool,
struct rspamd_tokenizer_config *cf, gsize *len);
- gboolean (*compatible_config) (struct rspamd_tokenizer_runtime *rt,
- gpointer ptr, gsize len);
- gboolean (*load_config) (rspamd_mempool_t *pool,
- struct rspamd_tokenizer_runtime *rt,
- gpointer ptr, gsize len);
- gboolean (*is_compat) (struct rspamd_tokenizer_runtime *rt);
- gint (*tokenize_func)(struct rspamd_tokenizer_runtime *rt,
+ gint (*tokenize_func)(struct rspamd_stat_ctx *ctx,
rspamd_mempool_t *pool,
GArray *words,
gboolean is_utf,
- const gchar *prefix);
+ const gchar *prefix,
+ GPtrArray *result);
};
/* Compare two token nodes */
@@ -39,28 +35,17 @@ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
guint64 *hash);
/* OSB tokenize function */
-gint rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
- rspamd_mempool_t *pool,
- GArray *input,
- gboolean is_utf,
- const gchar *prefix);
+gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
+ rspamd_mempool_t *pool,
+ GArray *words,
+ gboolean is_utf,
+ const gchar *prefix,
+ GPtrArray *result);
gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
struct rspamd_tokenizer_config *cf,
gsize *len);
-gboolean
-rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt,
- gpointer ptr, gsize len);
-
-gboolean
-rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool,
- struct rspamd_tokenizer_runtime *rt,
- gpointer ptr, gsize len);
-
-gboolean
-rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt);
-
#endif
/*
* vi:ts=4