about | summary | refs | log | tree | commit | diff | stats
path: root/src/libstat
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-02-23 14:29:31 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-02-23 14:29:31 +0000
commit: 21a12878cc50c97444c41886b23e418087922783 (patch)
tree: c1f74997ac28d4355ebf2eb0997b0e6e2f22770c /src/libstat
parent: fec137a7cccd626ce248f619011b2570f75438f8 (diff)
download: rspamd-21a12878cc50c97444c41886b23e418087922783.tar.gz
download: rspamd-21a12878cc50c97444c41886b23e418087922783.zip
Rework tokenization:
- Use normalized words if needed
- Allow using seeded XXHash instead of the hand-made legacy hash function
- Allow secure hashing using siphash
Diffstat (limited to 'src/libstat')
-rw-r--r-- src/libstat/stat_config.c | 2
-rw-r--r-- src/libstat/stat_process.c | 23
-rw-r--r-- src/libstat/tokenizers/osb.c | 152
-rw-r--r-- src/libstat/tokenizers/tokenizers.c | 13
-rw-r--r-- src/libstat/tokenizers/tokenizers.h | 2
5 files changed, 147 insertions, 45 deletions
diff --git a/src/libstat/stat_config.c b/src/libstat/stat_config.c
index 8a0514721..8b537f732 100644
--- a/src/libstat/stat_config.c
+++ b/src/libstat/stat_config.c
@@ -41,7 +41,7 @@ static struct rspamd_stat_classifier stat_classifiers[] = {
};
static struct rspamd_stat_tokenizer stat_tokenizers[] = {
- {"osb-text", osb_tokenize_text},
+ {"osb-text", rspamd_tokenizer_osb},
};
static struct rspamd_stat_backend stat_backends[] = {
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index eafbe2092..f5a4b9398 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -287,6 +287,20 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
GArray *words;
gchar *sub;
GList *cur;
+ const ucl_object_t *elt;
+ gboolean compat = TRUE;
+
+ /*
+ * XXX: Ugly repetition to be backward compatible
+ */
+ if (cf != NULL && cf->opts != NULL) {
+ elt = ucl_object_find_key (cf->opts, "hash");
+ if (elt != NULL && ucl_object_type (elt) == UCL_STRING) {
+ if (g_ascii_strcasecmp (ucl_object_tostring (elt), "xxh") == 0) {
+ compat = FALSE;
+ }
+ }
+ }
cur = task->text_parts;
@@ -297,8 +311,15 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
/*
* XXX: Use normalized words if needed here
*/
- tok->tokenizer->tokenize_func (cf, task->task_pool,
+
+ if (compat) {
+ tok->tokenizer->tokenize_func (cf, task->task_pool,
part->words, tok->tokens, part->is_utf);
+ }
+ else {
+ tok->tokenizer->tokenize_func (cf, task->task_pool,
+ part->normalized_words, tok->tokens, part->is_utf);
+ }
}
cur = g_list_next (cur);
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index b51e909a9..18157acd1 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -28,17 +28,28 @@
#include "tokenizers.h"
#include "stat_internal.h"
+#include "libstemmer.h"
+#include "xxhash.h"
+#include "siphash.h"
/* Size for features pipe */
-#define FEATURE_WINDOW_SIZE 5
-
-/* Minimum length of token */
-#define MIN_LEN 4
-
-extern const int primes[];
+#define DEFAULT_FEATURE_WINDOW_SIZE 5
+
+static const int primes[] = {
+ 1, 7,
+ 3, 13,
+ 5, 29,
+ 11, 51,
+ 23, 101,
+ 47, 203,
+ 97, 407,
+ 197, 817,
+ 397, 1637,
+ 797, 3277,
+};
int
-osb_tokenize_text (struct rspamd_tokenizer_config *cf,
+rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
rspamd_mempool_t * pool,
GArray * input,
GTree * tree,
@@ -46,9 +57,15 @@ osb_tokenize_text (struct rspamd_tokenizer_config *cf,
{
rspamd_token_t *new = NULL;
rspamd_fstring_t *token;
- guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
- gint i, processed = 0;
- guint w;
+ const ucl_object_t *elt;
+ guint64 *hashpipe, cur;
+ guint32 h1, h2;
+ guint processed = 0, i, w, window_size = DEFAULT_FEATURE_WINDOW_SIZE;
+ gboolean compat = TRUE, secure = FALSE;
+ gint64 seed = 0xdeadbabe;
+ guchar *key = NULL;
+ gsize keylen;
+ struct sipkey sk;
g_assert (tree != NULL);
@@ -56,32 +73,100 @@ osb_tokenize_text (struct rspamd_tokenizer_config *cf,
return FALSE;
}
- memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0]));
+ if (cf != NULL && cf->opts != NULL) {
+ elt = ucl_object_find_key (cf->opts, "hash");
+ if (elt != NULL && ucl_object_type (elt) == UCL_STRING) {
+ if (g_ascii_strncasecmp (ucl_object_tostring (elt), "xxh", 3)
+ == 0) {
+ compat = FALSE;
+ secure = FALSE;
+ elt = ucl_object_find_key (cf->opts, "seed");
+ if (elt != NULL && ucl_object_type (elt) == UCL_INT) {
+ seed = ucl_object_toint (elt);
+ }
+ }
+ else if (g_ascii_strncasecmp (ucl_object_tostring (elt), "sip", 3)
+ == 0) {
+ compat = FALSE;
+ elt = ucl_object_find_key (cf->opts, "seed");
+
+ if (elt != NULL && ucl_object_type (elt) == UCL_STRING) {
+ key = rspamd_decode_base32 (ucl_object_tostring (elt),
+ 0, &keylen);
+ if (keylen < 16) {
+ msg_warn ("siphash seed is too short: %s", keylen);
+ g_free (key);
+ }
+ else {
+ secure = TRUE;
+ sip_tokey (&sk, key);
+ g_free (key);
+ }
+ }
+ else {
+ msg_warn ("siphash cannot be used without seed");
+ }
+
+ }
+ }
+ elt = ucl_object_find_key (cf->opts, "window");
+ if (elt != NULL && ucl_object_type (elt) == UCL_INT) {
+ window_size = ucl_object_toint (elt);
+ if (window_size > DEFAULT_FEATURE_WINDOW_SIZE * 4) {
+ msg_err ("too large window size: %d", window_size);
+ window_size = DEFAULT_FEATURE_WINDOW_SIZE;
+ }
+ }
+ }
+
+ hashpipe = g_alloca (window_size * sizeof (hashpipe[0]));
+ memset (hashpipe, 0xfe, window_size * sizeof (hashpipe[0]));
for (w = 0; w < input->len; w ++) {
token = &g_array_index (input, rspamd_fstring_t, w);
- if (processed < FEATURE_WINDOW_SIZE) {
+ if (compat) {
+ cur = rspamd_fstrhash_lc (token, is_utf);
+ }
+ else {
+ /* We know that the words are normalized */
+ if (!secure) {
+ cur = XXH64 (token->begin, token->len, seed);
+ }
+ else {
+ cur = siphash24 (token->begin, token->len, &sk);
+ }
+ }
+
+ if (processed < window_size) {
/* Just fill a hashpipe */
- hashpipe[FEATURE_WINDOW_SIZE - ++processed] =
- rspamd_fstrhash_lc (token, is_utf);
+ hashpipe[window_size - ++processed] = cur;
}
else {
/* Shift hashpipe */
- for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
+ for (i = window_size - 1; i > 0; i--) {
hashpipe[i] = hashpipe[i - 1];
}
- hashpipe[0] = rspamd_fstrhash_lc (token, is_utf);
+ hashpipe[0] = cur;
processed++;
- for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
- h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
- h2 = hashpipe[0] * primes[1] + hashpipe[i] *
- primes[(i << 1) - 1];
+ for (i = 1; i < window_size; i++) {
new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
- new->datalen = sizeof(gint32) * 2;
- memcpy(new->data, &h1, sizeof(h1));
- memcpy(new->data + sizeof(h1), &h2, sizeof(h2));
+ new->datalen = sizeof (gint64);
+
+ if (compat) {
+ h1 = ((guint32)hashpipe[0]) * primes[0] +
+ ((guint32)hashpipe[i]) * primes[i << 1];
+ h2 = ((guint32)hashpipe[0]) * primes[1] +
+ ((guint32)hashpipe[i]) * primes[(i << 1) - 1];
+
+ memcpy(new->data, &h1, sizeof (h1));
+ memcpy(new->data + sizeof (h1), &h2, sizeof (h2));
+ }
+ else {
+ cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
+ memcpy (new->data, &cur, sizeof (cur));
+ }
if (g_tree_lookup (tree, new) == NULL) {
g_tree_insert (tree, new, new);
@@ -90,14 +175,23 @@ osb_tokenize_text (struct rspamd_tokenizer_config *cf,
}
}
- if (processed <= FEATURE_WINDOW_SIZE) {
+ if (processed <= window_size) {
for (i = 1; i < processed; i++) {
- h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
- h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
- new->datalen = sizeof(gint32) * 2;
- memcpy(new->data, &h1, sizeof(h1));
- memcpy(new->data + sizeof(h1), &h2, sizeof(h2));
+ new->datalen = sizeof (gint64);
+
+ if (compat) {
+ h1 = ((guint32)hashpipe[0]) * primes[0] +
+ ((guint32)hashpipe[i]) * primes[i << 1];
+ h2 = ((guint32)hashpipe[0]) * primes[1] +
+ ((guint32)hashpipe[i]) * primes[(i << 1) - 1];
+ memcpy(new->data, &h1, sizeof (h1));
+ memcpy(new->data + sizeof (h1), &h2, sizeof (h2));
+ }
+ else {
+ cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
+ memcpy (new->data, &cur, sizeof (cur));
+ }
if (g_tree_lookup (tree, new) == NULL) {
g_tree_insert (tree, new, new);
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 6ec7b1e10..2abe0f318 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -30,19 +30,6 @@
#include "tokenizers.h"
#include "stat_internal.h"
-const int primes[] = {
- 1, 7,
- 3, 13,
- 5, 29,
- 11, 51,
- 23, 101,
- 47, 203,
- 97, 407,
- 197, 817,
- 397, 1637,
- 797, 3277,
-};
-
const gchar t_delimiters[255] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 0bc594842..bab18b00a 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -31,7 +31,7 @@ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
gsize min_len, GList **exceptions);
/* OSB tokenize function */
-int osb_tokenize_text (struct rspamd_tokenizer_config *cf,
+int rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
rspamd_mempool_t *pool,
GArray *input,
GTree *tokens,