source.dussan.org Git - rspamd.git/commitdiff
Allow adding of prefix for tokenizers.
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 26 Jul 2015 20:43:52 +0000 (21:43 +0100)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 26 Jul 2015 20:43:52 +0000 (21:43 +0100)
src/libstat/stat_process.c
src/libstat/tokenizers/osb.c
src/libstat/tokenizers/tokenizers.h

index 536db9b16b018a7779aa531a1ca7ee8e1f697167..458c42cad3959c0655753f081fb2fb9af5744191 100644 (file)
@@ -318,11 +318,11 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
                if (!IS_PART_EMPTY (part) && part->words != NULL) {
                        if (compat) {
                                tok->tokenizer->tokenize_func (cf, task->task_pool,
-                                       part->words, tok->tokens, IS_PART_UTF (part));
+                                       part->words, tok->tokens, IS_PART_UTF (part), NULL);
                        }
                        else {
                                tok->tokenizer->tokenize_func (cf, task->task_pool,
-                                       part->normalized_words, tok->tokens, IS_PART_UTF (part));
+                                       part->normalized_words, tok->tokens, IS_PART_UTF (part), NULL);
                        }
                }
        }
@@ -342,7 +342,8 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
                                        task->task_pool,
                                        words,
                                        tok->tokens,
-                                       TRUE);
+                                       TRUE,
+                                       "SUBJECT");
                        g_array_free (words, TRUE);
                }
        }
index dd413a9b4736e6d81f4528174b3e7dd3d1654864..ae6eabb27d108cf0b395ccfd781a82f5db593efc 100644 (file)
@@ -212,12 +212,13 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
        rspamd_mempool_t * pool,
        GArray * input,
        GTree * tree,
-       gboolean is_utf)
+       gboolean is_utf,
+       const gchar *prefix)
 {
        rspamd_token_t *new = NULL;
        rspamd_fstring_t *token;
        struct rspamd_osb_tokenizer_config *osb_cf;
-       guint64 *hashpipe, cur;
+       guint64 *hashpipe, cur, seed;
        guint32 h1, h2;
        guint processed = 0, i, w, window_size;
 
@@ -236,6 +237,13 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
 
        window_size = osb_cf->window_size;
 
+       if (prefix) {
+               seed = XXH64 (prefix, strlen (prefix), osb_cf->seed);
+       }
+       else {
+               seed = osb_cf->seed;
+       }
+
        hashpipe = g_alloca (window_size * sizeof (hashpipe[0]));
        memset (hashpipe, 0xfe, window_size * sizeof (hashpipe[0]));
 
@@ -253,6 +261,10 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
                        else {
                                rspamd_cryptobox_siphash ((guchar *)&cur, token->begin,
                                                token->len, osb_cf->sk);
+
+                               if (prefix) {
+                                       cur ^= seed;
+                               }
                        }
                }
 
index 2c96b7cfff38e7aa2627050d43bba3b830b14c3d..ef3fc8af05adc2598a5311f72c661127c6ff74ec 100644 (file)
@@ -19,7 +19,8 @@ struct rspamd_stat_tokenizer {
                        rspamd_mempool_t *pool,
                        GArray *words,
                        GTree *result,
-                       gboolean is_utf);
+                       gboolean is_utf,
+                       const gchar *prefix);
 };
 
 /* Compare two token nodes */
@@ -36,7 +37,8 @@ gint rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
        rspamd_mempool_t *pool,
        GArray *input,
        GTree *tokens,
-       gboolean is_utf);
+       gboolean is_utf,
+       const gchar *prefix);
 
 gpointer rspamd_tokenizer_osb_get_config (struct rspamd_tokenizer_config *cf,
                gsize *len);