source.dussan.org Git - rspamd.git/commitdiff
Fix tokenization
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 5 Jan 2016 16:59:02 +0000 (16:59 +0000)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 5 Jan 2016 16:59:02 +0000 (16:59 +0000)
src/libstat/stat_config.c
src/libstat/stat_internal.h
src/libstat/tokenizers/osb.c
src/libstat/tokenizers/tokenizers.h

index 662cfa79903010bf2684ef1f39172afcb8d5fb98..baf757ac7c9793437203fbd7ed4ab48fceb6ee85 100644 (file)
@@ -44,18 +44,12 @@ static struct rspamd_stat_tokenizer stat_tokenizers[] = {
        {
                .name = "osb-text",
                .get_config = rspamd_tokenizer_osb_get_config,
-               .compatible_config = rspamd_tokenizer_osb_compatible_config,
                .tokenize_func = rspamd_tokenizer_osb,
-               .load_config = rspamd_tokenizer_osb_load_config,
-               .is_compat = rspamd_tokenizer_osb_is_compat
        },
        {
                .name = "osb",
                .get_config = rspamd_tokenizer_osb_get_config,
-               .compatible_config = rspamd_tokenizer_osb_compatible_config,
                .tokenize_func = rspamd_tokenizer_osb,
-               .load_config = rspamd_tokenizer_osb_load_config,
-               .is_compat = rspamd_tokenizer_osb_is_compat
        },
 };
 
index 1e9d5dd44133be9654cfdfd32547ef7ca9b2e88d..2dd3f2fb52a36c158ad784a4962ec135b417288b 100644 (file)
@@ -35,15 +35,6 @@ enum stat_process_stage {
        RSPAMD_STAT_STAGE_POST
 };
 
-struct rspamd_tokenizer_runtime {
-       GTree *tokens;
-       const gchar *name;
-       struct rspamd_stat_tokenizer *tokenizer;
-       struct rspamd_tokenizer_config *tkcf;
-       gpointer config;
-       gsize conf_len;
-};
-
 struct rspamd_statfile_runtime {
        struct rspamd_statfile_config *st;
        gpointer backend_runtime;
@@ -90,7 +81,7 @@ typedef struct token_node_s {
        guchar data[RSPAMD_MAX_TOKEN_LEN];
        guint window_idx;
        guint datalen;
-       gdouble values[1];
+       gdouble values[0];
 } rspamd_token_t;
 
 struct rspamd_stat_ctx {
index 2d1b3bb3e8d6c4c5919485567dc2d85b104c2a43..55a0c6bba47f9354b90b56f68483fb91630b76a3 100644 (file)
@@ -189,6 +189,7 @@ rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
        return osb_cf;
 }
 
+#if 0
 gboolean
 rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt,
                        gpointer ptr, gsize len)
@@ -223,28 +224,68 @@ rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt,
        return ret;
 }
 
+gboolean
+rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool,
+               struct rspamd_tokenizer_runtime *rt,
+               gpointer ptr, gsize len)
+{
+       struct rspamd_osb_tokenizer_config *osb_cf;
+
+       if (ptr == NULL || len == 0) {
+               osb_cf = rspamd_tokenizer_osb_config_from_ucl (pool, rt->tkcf->opts);
+
+               if (osb_cf->ht != RSPAMD_OSB_HASH_COMPAT) {
+                       /* Trying to load incompatible configuration */
+                       msg_err_pool ("cannot load tokenizer configuration from a legacy "
+                                       "statfile; maybe you have forgotten to set 'compat' option"
+                                       " in the tokenizer configuration");
+
+                       return FALSE;
+               }
+       }
+       else {
+               g_assert (len == sizeof (*osb_cf));
+               osb_cf = ptr;
+       }
+
+       rt->config = osb_cf;
+       rt->conf_len = sizeof (*osb_cf);
+
+       return TRUE;
+}
+
+gboolean
+rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt)
+{
+       struct rspamd_osb_tokenizer_config *osb_cf = rt->config;
+
+       return (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT);
+}
+#endif
+
+
+
 gint
-rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
-       rspamd_mempool_t * pool,
-       GArray * input,
-       gboolean is_utf,
-       const gchar *prefix)
+rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
+               rspamd_mempool_t *pool,
+               GArray *words,
+               gboolean is_utf,
+               const gchar *prefix,
+               GPtrArray *result)
 {
-       rspamd_token_t *new = NULL;
+       rspamd_token_t *new_tok = NULL;
        rspamd_ftok_t *token;
        struct rspamd_osb_tokenizer_config *osb_cf;
        guint64 *hashpipe, cur, seed;
        guint32 h1, h2;
+       gsize token_size;
        guint processed = 0, i, w, window_size;
-       GTree *tree = rt->tokens;
-
-       g_assert (tree != NULL);
 
-       if (input == NULL) {
+       if (words == NULL) {
                return FALSE;
        }
 
-       osb_cf = rt->config;
+       osb_cf = ctx->tkcf;
        window_size = osb_cf->window_size;
 
        if (prefix) {
@@ -256,9 +297,11 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
 
        hashpipe = g_alloca (window_size * sizeof (hashpipe[0]));
        memset (hashpipe, 0xfe, window_size * sizeof (hashpipe[0]));
+       token_size = sizeof (rspamd_token_t) + sizeof (gdouble) * ctx->statfiles->len;
+       g_assert (token_size > 0);
 
-       for (w = 0; w < input->len; w ++) {
-               token = &g_array_index (input, rspamd_ftok_t, w);
+       for (w = 0; w < words->len; w ++) {
+               token = &g_array_index (words, rspamd_ftok_t, w);
 
                if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
                        cur = rspamd_fstrhash_lc (token, is_utf);
@@ -278,6 +321,25 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
                        }
                }
 
+#define ADD_TOKEN do {\
+    new_tok = rspamd_mempool_alloc0 (pool, token_size); \
+    new_tok->datalen = sizeof (gint64); \
+    if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { \
+        h1 = ((guint32)hashpipe[0]) * primes[0] + \
+            ((guint32)hashpipe[i]) * primes[i << 1]; \
+        h2 = ((guint32)hashpipe[0]) * primes[1] + \
+            ((guint32)hashpipe[i]) * primes[(i << 1) - 1]; \
+        memcpy(new_tok->data, &h1, sizeof (h1)); \
+        memcpy(new_tok->data + sizeof (h1), &h2, sizeof (h2)); \
+    } \
+    else { \
+        cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; \
+        memcpy (new_tok->data, &cur, sizeof (cur)); \
+    } \
+    new_tok->window_idx = i + 1; \
+    g_ptr_array_add (result, new_tok); \
+  } while(0)
+
                if (processed < window_size) {
                        /* Just fill a hashpipe */
                        hashpipe[window_size - ++processed] = cur;
@@ -291,97 +353,20 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
                        processed++;
 
                        for (i = 1; i < window_size; i++) {
-                               new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
-                               new->datalen = sizeof (gint64);
-
-                               if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
-                                       h1 = ((guint32)hashpipe[0]) * primes[0] +
-                                                       ((guint32)hashpipe[i]) * primes[i << 1];
-                                       h2 = ((guint32)hashpipe[0]) * primes[1] +
-                                                       ((guint32)hashpipe[i]) * primes[(i << 1) - 1];
-
-                                       memcpy(new->data, &h1, sizeof (h1));
-                                       memcpy(new->data + sizeof (h1), &h2, sizeof (h2));
-                               }
-                               else {
-                                       cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
-                                       memcpy (new->data, &cur, sizeof (cur));
-                               }
-
-                               new->window_idx = i + 1;
-
-                               if (g_tree_lookup (tree, new) == NULL) {
-                                       g_tree_insert (tree, new, new);
-                               }
+                               ADD_TOKEN;
                        }
                }
        }
 
        if (processed <= window_size) {
                memmove (hashpipe, hashpipe + (window_size - processed + 1), processed);
-               for (i = 1; i < processed; i++) {
-                       new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
-                       new->datalen = sizeof (gint64);
-
-                       if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
-                               h1 = ((guint32)hashpipe[0]) * primes[0] +
-                                               ((guint32)hashpipe[i]) * primes[i << 1];
-                               h2 = ((guint32)hashpipe[0]) * primes[1] +
-                                               ((guint32)hashpipe[i]) * primes[(i << 1) - 1];
-                               memcpy(new->data, &h1, sizeof (h1));
-                               memcpy(new->data + sizeof (h1), &h2, sizeof (h2));
-                       }
-                       else {
-                               cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
-                               memcpy (new->data, &cur, sizeof (cur));
-                       }
-
-                       new->window_idx = i + 1;
-
-                       if (g_tree_lookup (tree, new) == NULL) {
-                               g_tree_insert (tree, new, new);
-                       }
-               }
-       }
-
-       return TRUE;
-}
-
-
-gboolean
-rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool,
-               struct rspamd_tokenizer_runtime *rt,
-               gpointer ptr, gsize len)
-{
-       struct rspamd_osb_tokenizer_config *osb_cf;
 
-       if (ptr == NULL || len == 0) {
-               osb_cf = rspamd_tokenizer_osb_config_from_ucl (pool, rt->tkcf->opts);
-
-               if (osb_cf->ht != RSPAMD_OSB_HASH_COMPAT) {
-                       /* Trying to load incompatible configuration */
-                       msg_err_pool ("cannot load tokenizer configuration from a legacy "
-                                       "statfile; maybe you have forgotten to set 'compat' option"
-                                       " in the tokenizer configuration");
-
-                       return FALSE;
+               for (i = 1; i < processed; i++) {
+                       ADD_TOKEN;
                }
        }
-       else {
-               g_assert (len == sizeof (*osb_cf));
-               osb_cf = ptr;
-       }
 
-       rt->config = osb_cf;
-       rt->conf_len = sizeof (*osb_cf);
+#undef ADD_TOKEN
 
        return TRUE;
 }
-
-gboolean
-rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt)
-{
-       struct rspamd_osb_tokenizer_config *osb_cf = rt->config;
-
-       return (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT);
-}
index f4c9a5ed364ee3489b214ab711447fbdcee74127..70ff7560cdb75de79462417dd51fc6007d945bf3 100644 (file)
 #define RSPAMD_DEFAULT_TOKENIZER "osb"
 
 struct rspamd_tokenizer_runtime;
+struct rspamd_stat_ctx;
 
 /* Common tokenizer structure */
 struct rspamd_stat_tokenizer {
        gchar *name;
        gpointer (*get_config) (rspamd_mempool_t *pool,
                        struct rspamd_tokenizer_config *cf, gsize *len);
-       gboolean (*compatible_config) (struct rspamd_tokenizer_runtime *rt,
-                       gpointer ptr, gsize len);
-       gboolean (*load_config) (rspamd_mempool_t *pool,
-                       struct rspamd_tokenizer_runtime *rt,
-                       gpointer ptr, gsize len);
-       gboolean (*is_compat) (struct rspamd_tokenizer_runtime *rt);
-       gint (*tokenize_func)(struct rspamd_tokenizer_runtime *rt,
+       gint (*tokenize_func)(struct rspamd_stat_ctx *ctx,
                        rspamd_mempool_t *pool,
                        GArray *words,
                        gboolean is_utf,
-                       const gchar *prefix);
+                       const gchar *prefix,
+                       GPtrArray *result);
 };
 
 /* Compare two token nodes */
@@ -39,28 +35,17 @@ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
                guint64 *hash);
 
 /* OSB tokenize function */
-gint rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
-       rspamd_mempool_t *pool,
-       GArray *input,
-       gboolean is_utf,
-       const gchar *prefix);
+gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
+               rspamd_mempool_t *pool,
+               GArray *words,
+               gboolean is_utf,
+               const gchar *prefix,
+               GPtrArray *result);
 
 gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
                struct rspamd_tokenizer_config *cf,
                gsize *len);
 
-gboolean
-rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt,
-                       gpointer ptr, gsize len);
-
-gboolean
-rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool,
-               struct rspamd_tokenizer_runtime *rt,
-               gpointer ptr, gsize len);
-
-gboolean
-rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt);
-
 #endif
 /*
  * vi:ts=4