about | summary | refs | log | tree | commit | diff | stats
path: root/src/libstat
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-26 21:43:52 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-26 21:43:52 +0100
commit 277ffc972d161a0928bc07c2e5d110b00115744c (patch)
tree 246b76ade650c124e987f9ff4a662ded8b23d07e /src/libstat
parent e38b84cd48f8c465b94395a5641f85209ab22b68 (diff)
download rspamd-277ffc972d161a0928bc07c2e5d110b00115744c.tar.gz
rspamd-277ffc972d161a0928bc07c2e5d110b00115744c.zip
Allow adding a prefix for tokenizers.
Diffstat (limited to 'src/libstat')
-rw-r--r-- src/libstat/stat_process.c 7
-rw-r--r-- src/libstat/tokenizers/osb.c 16
-rw-r--r-- src/libstat/tokenizers/tokenizers.h 6
3 files changed, 22 insertions, 7 deletions
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index 536db9b16..458c42cad 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -318,11 +318,11 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
if (!IS_PART_EMPTY (part) && part->words != NULL) {
if (compat) {
tok->tokenizer->tokenize_func (cf, task->task_pool,
- part->words, tok->tokens, IS_PART_UTF (part));
+ part->words, tok->tokens, IS_PART_UTF (part), NULL);
}
else {
tok->tokenizer->tokenize_func (cf, task->task_pool,
- part->normalized_words, tok->tokens, IS_PART_UTF (part));
+ part->normalized_words, tok->tokens, IS_PART_UTF (part), NULL);
}
}
}
@@ -342,7 +342,8 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
task->task_pool,
words,
tok->tokens,
- TRUE);
+ TRUE,
+ "SUBJECT");
g_array_free (words, TRUE);
}
}
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index dd413a9b4..ae6eabb27 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -212,12 +212,13 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
rspamd_mempool_t * pool,
GArray * input,
GTree * tree,
- gboolean is_utf)
+ gboolean is_utf,
+ const gchar *prefix)
{
rspamd_token_t *new = NULL;
rspamd_fstring_t *token;
struct rspamd_osb_tokenizer_config *osb_cf;
- guint64 *hashpipe, cur;
+ guint64 *hashpipe, cur, seed;
guint32 h1, h2;
guint processed = 0, i, w, window_size;
@@ -236,6 +237,13 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
window_size = osb_cf->window_size;
+ if (prefix) {
+ seed = XXH64 (prefix, strlen (prefix), osb_cf->seed);
+ }
+ else {
+ seed = osb_cf->seed;
+ }
+
hashpipe = g_alloca (window_size * sizeof (hashpipe[0]));
memset (hashpipe, 0xfe, window_size * sizeof (hashpipe[0]));
@@ -253,6 +261,10 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
else {
rspamd_cryptobox_siphash ((guchar *)&cur, token->begin,
token->len, osb_cf->sk);
+
+ if (prefix) {
+ cur ^= seed;
+ }
}
}
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 2c96b7cff..ef3fc8af0 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -19,7 +19,8 @@ struct rspamd_stat_tokenizer {
rspamd_mempool_t *pool,
GArray *words,
GTree *result,
- gboolean is_utf);
+ gboolean is_utf,
+ const gchar *prefix);
};
/* Compare two token nodes */
@@ -36,7 +37,8 @@ gint rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
rspamd_mempool_t *pool,
GArray *input,
GTree *tokens,
- gboolean is_utf);
+ gboolean is_utf,
+ const gchar *prefix);
gpointer rspamd_tokenizer_osb_get_config (struct rspamd_tokenizer_config *cf,
gsize *len);