about | summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru>, 2016-05-11 15:54:47 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>, 2016-05-11 15:54:47 +0100
commit: 4932b636ef23b7b0c9c356c9144457039253945e (patch)
tree: 0f086aac6c37acc408a85fcfd45800144b63574c /src
parent: bcb5eaadd0b810b399fbdb133253bbc9ec7f060e (diff)
download: rspamd-4932b636ef23b7b0c9c356c9144457039253945e.tar.gz
          rspamd-4932b636ef23b7b0c9c356c9144457039253945e.zip
[Feature] Add more algorithms for shingles generation
Diffstat (limited to 'src')
-rw-r--r--  src/libutil/shingles.c     74
-rw-r--r--  src/libutil/shingles.h      9
-rw-r--r--  src/plugins/fuzzy_check.c   3
3 files changed, 69 insertions, 17 deletions
diff --git a/src/libutil/shingles.c b/src/libutil/shingles.c
index c2acae6d3..04a9975bc 100644
--- a/src/libutil/shingles.c
+++ b/src/libutil/shingles.c
@@ -24,7 +24,8 @@ rspamd_shingles_generate (GArray *input,
const guchar key[16],
rspamd_mempool_t *pool,
rspamd_shingles_filter filter,
- gpointer filterd)
+ gpointer filterd,
+ enum rspamd_shingle_alg alg)
{
struct rspamd_shingle *res;
GArray *hashes[RSPAMD_SHINGLE_SIZE];
@@ -35,7 +36,8 @@ rspamd_shingles_generate (GArray *input,
rspamd_ftok_t *word;
rspamd_cryptobox_hash_state_t bs;
guint64 val;
- gint i, j, beg = 0;
+ gint i, j, k, beg = 0;
+ enum rspamd_cryptobox_fast_hash_type ht;
if (pool != NULL) {
res = rspamd_mempool_alloc (pool, sizeof (*res));
@@ -71,22 +73,64 @@ rspamd_shingles_generate (GArray *input,
}
/* Now parse input words into a vector of hashes using rolling window */
- for (i = 0; i <= (gint)input->len; i ++) {
- if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {
- for (j = beg; j < i; j ++) {
- word = &g_array_index (input, rspamd_ftok_t, j);
- row = rspamd_fstring_append (row, word->begin, word->len);
+ if (alg == RSPAMD_SHINGLES_OLD) {
+ for (i = 0; i <= (gint)input->len; i ++) {
+ if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {
+ for (j = beg; j < i; j ++) {
+ word = &g_array_index (input, rspamd_ftok_t, j);
+ row = rspamd_fstring_append (row, word->begin, word->len);
+ }
+ beg++;
+
+ /* Now we need to create a new row here */
+ for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
+ rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len,
+ keys[j]);
+ g_array_append_val (hashes[j], val);
+ }
+
+ row = rspamd_fstring_assign (row, "", 0);
}
- beg++;
+ }
+ }
+ else {
+ guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE];
- /* Now we need to create a new row here */
- for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
- rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len,
- keys[j]);
- g_array_append_val (hashes[j], val);
- }
+ if (alg == RSPAMD_SHINGLES_XXHASH) {
+ ht = RSPAMD_CRYPTOBOX_XXHASH64;
+ }
+ else {
+ ht = RSPAMD_CRYPTOBOX_MUMHASH;
+ }
- row = rspamd_fstring_assign (row, "", 0);
+ memset (res, 0, sizeof (res));
+
+ for (i = 0; i <= (gint)input->len; i ++) {
+ if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {
+
+ for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
+ /* Shift hashes window to right */
+ for (k = 0; k < SHINGLES_WINDOW - 1; k ++) {
+ res[j * SHINGLES_WINDOW + k] =
+ res[j * SHINGLES_WINDOW + k + 1];
+ }
+
+ word = &g_array_index (input, rspamd_ftok_t, beg);
+ /* Insert the last element to the pipe */
+ res[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] =
+ rspamd_cryptobox_fast_hash_specific (ht,
+ word->begin, word->len,
+ *(guint64 *)keys[j]);
+
+ val = 0;
+ for (k = 0; k < SHINGLES_WINDOW; k ++) {
+ val ^= res[j * SHINGLES_WINDOW + k];
+ }
+
+ g_array_append_val (hashes[j], val);
+ }
+ beg++;
+ }
}
}
diff --git a/src/libutil/shingles.h b/src/libutil/shingles.h
index d252a78f6..a46a2fc0f 100644
--- a/src/libutil/shingles.h
+++ b/src/libutil/shingles.h
@@ -25,6 +25,12 @@ struct rspamd_shingle {
guint64 hashes[RSPAMD_SHINGLE_SIZE];
};
+enum rspamd_shingle_alg {
+ RSPAMD_SHINGLES_OLD = 0,
+ RSPAMD_SHINGLES_XXHASH,
+ RSPAMD_SHINGLES_MUMHASH,
+};
+
/**
* Shingles filtering function
* @param input input array of hashes
@@ -48,7 +54,8 @@ struct rspamd_shingle* rspamd_shingles_generate (GArray *input,
const guchar key[16],
rspamd_mempool_t *pool,
rspamd_shingles_filter filter,
- gpointer filterd);
+ gpointer filterd,
+ enum rspamd_shingle_alg alg);
/**
* Compares two shingles and return result as a floating point value - 1.0
diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c
index a5b62875e..304ce3704 100644
--- a/src/plugins/fuzzy_check.c
+++ b/src/plugins/fuzzy_check.c
@@ -1068,7 +1068,8 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule,
rule->shingles_key->str);
sh = rspamd_shingles_generate (words,
rule->shingles_key->str, pool,
- rspamd_shingles_default_filter, NULL);
+ rspamd_shingles_default_filter, NULL,
+ RSPAMD_SHINGLES_OLD);
if (sh != NULL) {
memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl));
shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;