diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-02-23 14:29:31 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-02-23 14:29:31 +0000 |
commit | 21a12878cc50c97444c41886b23e418087922783 (patch) | |
tree | c1f74997ac28d4355ebf2eb0997b0e6e2f22770c /src/libstat | |
parent | fec137a7cccd626ce248f619011b2570f75438f8 (diff) | |
download | rspamd-21a12878cc50c97444c41886b23e418087922783.tar.gz rspamd-21a12878cc50c97444c41886b23e418087922783.zip |
Rework tokenization:
- Use normalized words if needed
- Allow use of seeded XXHash instead of the hand-made legacy hash
- Allow secure hashing using siphash
Diffstat (limited to 'src/libstat')
-rw-r--r-- | src/libstat/stat_config.c | 2 | ||||
-rw-r--r-- | src/libstat/stat_process.c | 23 | ||||
-rw-r--r-- | src/libstat/tokenizers/osb.c | 152 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 13 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 2 |
5 files changed, 147 insertions, 45 deletions
diff --git a/src/libstat/stat_config.c b/src/libstat/stat_config.c index 8a0514721..8b537f732 100644 --- a/src/libstat/stat_config.c +++ b/src/libstat/stat_config.c @@ -41,7 +41,7 @@ static struct rspamd_stat_classifier stat_classifiers[] = { }; static struct rspamd_stat_tokenizer stat_tokenizers[] = { - {"osb-text", osb_tokenize_text}, + {"osb-text", rspamd_tokenizer_osb}, }; static struct rspamd_stat_backend stat_backends[] = { diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index eafbe2092..f5a4b9398 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -287,6 +287,20 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf, GArray *words; gchar *sub; GList *cur; + const ucl_object_t *elt; + gboolean compat = TRUE; + + /* + * XXX: Ugly repetition to be backward compatible + */ + if (cf != NULL && cf->opts != NULL) { + elt = ucl_object_find_key (cf->opts, "hash"); + if (elt != NULL && ucl_object_type (elt) == UCL_STRING) { + if (g_ascii_strcasecmp (ucl_object_tostring (elt), "xxh") == 0) { + compat = FALSE; + } + } + } cur = task->text_parts; @@ -297,8 +311,15 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf, /* * XXX: Use normalized words if needed here */ - tok->tokenizer->tokenize_func (cf, task->task_pool, + + if (compat) { + tok->tokenizer->tokenize_func (cf, task->task_pool, part->words, tok->tokens, part->is_utf); + } + else { + tok->tokenizer->tokenize_func (cf, task->task_pool, + part->normalized_words, tok->tokens, part->is_utf); + } } cur = g_list_next (cur); diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c index b51e909a9..18157acd1 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -28,17 +28,28 @@ #include "tokenizers.h" #include "stat_internal.h" +#include "libstemmer.h" +#include "xxhash.h" +#include "siphash.h" /* Size for features pipe */ -#define FEATURE_WINDOW_SIZE 5 - -/* Minimum length of token */ -#define MIN_LEN 4 - 
-extern const int primes[]; +#define DEFAULT_FEATURE_WINDOW_SIZE 5 + +static const int primes[] = { + 1, 7, + 3, 13, + 5, 29, + 11, 51, + 23, 101, + 47, 203, + 97, 407, + 197, 817, + 397, 1637, + 797, 3277, +}; int -osb_tokenize_text (struct rspamd_tokenizer_config *cf, +rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf, rspamd_mempool_t * pool, GArray * input, GTree * tree, @@ -46,9 +57,15 @@ osb_tokenize_text (struct rspamd_tokenizer_config *cf, { rspamd_token_t *new = NULL; rspamd_fstring_t *token; - guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2; - gint i, processed = 0; - guint w; + const ucl_object_t *elt; + guint64 *hashpipe, cur; + guint32 h1, h2; + guint processed = 0, i, w, window_size = DEFAULT_FEATURE_WINDOW_SIZE; + gboolean compat = TRUE, secure = FALSE; + gint64 seed = 0xdeadbabe; + guchar *key = NULL; + gsize keylen; + struct sipkey sk; g_assert (tree != NULL); @@ -56,32 +73,100 @@ osb_tokenize_text (struct rspamd_tokenizer_config *cf, return FALSE; } - memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0])); + if (cf != NULL && cf->opts != NULL) { + elt = ucl_object_find_key (cf->opts, "hash"); + if (elt != NULL && ucl_object_type (elt) == UCL_STRING) { + if (g_ascii_strncasecmp (ucl_object_tostring (elt), "xxh", 3) + == 0) { + compat = FALSE; + secure = FALSE; + elt = ucl_object_find_key (cf->opts, "seed"); + if (elt != NULL && ucl_object_type (elt) == UCL_INT) { + seed = ucl_object_toint (elt); + } + } + else if (g_ascii_strncasecmp (ucl_object_tostring (elt), "sip", 3) + == 0) { + compat = FALSE; + elt = ucl_object_find_key (cf->opts, "seed"); + + if (elt != NULL && ucl_object_type (elt) == UCL_STRING) { + key = rspamd_decode_base32 (ucl_object_tostring (elt), + 0, &keylen); + if (keylen < 16) { + msg_warn ("siphash seed is too short: %s", keylen); + g_free (key); + } + else { + secure = TRUE; + sip_tokey (&sk, key); + g_free (key); + } + } + else { + msg_warn ("siphash cannot be used without seed"); + } + + } + } + elt = 
ucl_object_find_key (cf->opts, "window"); + if (elt != NULL && ucl_object_type (elt) == UCL_INT) { + window_size = ucl_object_toint (elt); + if (window_size > DEFAULT_FEATURE_WINDOW_SIZE * 4) { + msg_err ("too large window size: %d", window_size); + window_size = DEFAULT_FEATURE_WINDOW_SIZE; + } + } + } + + hashpipe = g_alloca (window_size * sizeof (hashpipe[0])); + memset (hashpipe, 0xfe, window_size * sizeof (hashpipe[0])); for (w = 0; w < input->len; w ++) { token = &g_array_index (input, rspamd_fstring_t, w); - if (processed < FEATURE_WINDOW_SIZE) { + if (compat) { + cur = rspamd_fstrhash_lc (token, is_utf); + } + else { + /* We know that the words are normalized */ + if (!secure) { + cur = XXH64 (token->begin, token->len, seed); + } + else { + cur = siphash24 (token->begin, token->len, &sk); + } + } + + if (processed < window_size) { /* Just fill a hashpipe */ - hashpipe[FEATURE_WINDOW_SIZE - ++processed] = - rspamd_fstrhash_lc (token, is_utf); + hashpipe[window_size - ++processed] = cur; } else { /* Shift hashpipe */ - for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) { + for (i = window_size - 1; i > 0; i--) { hashpipe[i] = hashpipe[i - 1]; } - hashpipe[0] = rspamd_fstrhash_lc (token, is_utf); + hashpipe[0] = cur; processed++; - for (i = 1; i < FEATURE_WINDOW_SIZE; i++) { - h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; - h2 = hashpipe[0] * primes[1] + hashpipe[i] * - primes[(i << 1) - 1]; + for (i = 1; i < window_size; i++) { new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t)); - new->datalen = sizeof(gint32) * 2; - memcpy(new->data, &h1, sizeof(h1)); - memcpy(new->data + sizeof(h1), &h2, sizeof(h2)); + new->datalen = sizeof (gint64); + + if (compat) { + h1 = ((guint32)hashpipe[0]) * primes[0] + + ((guint32)hashpipe[i]) * primes[i << 1]; + h2 = ((guint32)hashpipe[0]) * primes[1] + + ((guint32)hashpipe[i]) * primes[(i << 1) - 1]; + + memcpy(new->data, &h1, sizeof (h1)); + memcpy(new->data + sizeof (h1), &h2, sizeof (h2)); + } + else { + cur 
= hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; + memcpy (new->data, &cur, sizeof (cur)); + } if (g_tree_lookup (tree, new) == NULL) { g_tree_insert (tree, new, new); @@ -90,14 +175,23 @@ osb_tokenize_text (struct rspamd_tokenizer_config *cf, } } - if (processed <= FEATURE_WINDOW_SIZE) { + if (processed <= window_size) { for (i = 1; i < processed; i++) { - h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; - h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t)); - new->datalen = sizeof(gint32) * 2; - memcpy(new->data, &h1, sizeof(h1)); - memcpy(new->data + sizeof(h1), &h2, sizeof(h2)); + new->datalen = sizeof (gint64); + + if (compat) { + h1 = ((guint32)hashpipe[0]) * primes[0] + + ((guint32)hashpipe[i]) * primes[i << 1]; + h2 = ((guint32)hashpipe[0]) * primes[1] + + ((guint32)hashpipe[i]) * primes[(i << 1) - 1]; + memcpy(new->data, &h1, sizeof (h1)); + memcpy(new->data + sizeof (h1), &h2, sizeof (h2)); + } + else { + cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; + memcpy (new->data, &cur, sizeof (cur)); + } if (g_tree_lookup (tree, new) == NULL) { g_tree_insert (tree, new, new); diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 6ec7b1e10..2abe0f318 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -30,19 +30,6 @@ #include "tokenizers.h" #include "stat_internal.h" -const int primes[] = { - 1, 7, - 3, 13, - 5, 29, - 11, 51, - 23, 101, - 47, 203, - 97, 407, - 197, 817, - 397, 1637, - 797, 3277, -}; - const gchar t_delimiters[255] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index 0bc594842..bab18b00a 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -31,7 +31,7 @@ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean 
is_utf, gsize min_len, GList **exceptions); /* OSB tokenize function */ -int osb_tokenize_text (struct rspamd_tokenizer_config *cf, +int rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf, rspamd_mempool_t *pool, GArray *input, GTree *tokens, |