@@ -24,7 +24,8 @@ rspamd_shingles_generate (GArray *input, | |||
const guchar key[16], | |||
rspamd_mempool_t *pool, | |||
rspamd_shingles_filter filter, | |||
gpointer filterd) | |||
gpointer filterd, | |||
enum rspamd_shingle_alg alg) | |||
{ | |||
struct rspamd_shingle *res; | |||
GArray *hashes[RSPAMD_SHINGLE_SIZE]; | |||
@@ -35,7 +36,8 @@ rspamd_shingles_generate (GArray *input, | |||
rspamd_ftok_t *word; | |||
rspamd_cryptobox_hash_state_t bs; | |||
guint64 val; | |||
gint i, j, beg = 0; | |||
gint i, j, k, beg = 0; | |||
enum rspamd_cryptobox_fast_hash_type ht; | |||
if (pool != NULL) { | |||
res = rspamd_mempool_alloc (pool, sizeof (*res)); | |||
@@ -71,22 +73,64 @@ rspamd_shingles_generate (GArray *input, | |||
} | |||
/* Now parse input words into a vector of hashes using rolling window */ | |||
for (i = 0; i <= (gint)input->len; i ++) { | |||
if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) { | |||
for (j = beg; j < i; j ++) { | |||
word = &g_array_index (input, rspamd_ftok_t, j); | |||
row = rspamd_fstring_append (row, word->begin, word->len); | |||
if (alg == RSPAMD_SHINGLES_OLD) { | |||
for (i = 0; i <= (gint)input->len; i ++) { | |||
if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) { | |||
for (j = beg; j < i; j ++) { | |||
word = &g_array_index (input, rspamd_ftok_t, j); | |||
row = rspamd_fstring_append (row, word->begin, word->len); | |||
} | |||
beg++; | |||
/* Now we need to create a new row here */ | |||
for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) { | |||
rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len, | |||
keys[j]); | |||
g_array_append_val (hashes[j], val); | |||
} | |||
row = rspamd_fstring_assign (row, "", 0); | |||
} | |||
beg++; | |||
} | |||
} | |||
else { | |||
guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE]; | |||
/* Now we need to create a new row here */ | |||
for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) { | |||
rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len, | |||
keys[j]); | |||
g_array_append_val (hashes[j], val); | |||
} | |||
if (alg == RSPAMD_SHINGLES_XXHASH) { | |||
ht = RSPAMD_CRYPTOBOX_XXHASH64; | |||
} | |||
else { | |||
ht = RSPAMD_CRYPTOBOX_MUMHASH; | |||
} | |||
row = rspamd_fstring_assign (row, "", 0); | |||
memset (res, 0, sizeof (res)); | |||
for (i = 0; i <= (gint)input->len; i ++) { | |||
if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) { | |||
for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) { | |||
/* Shift hashes window to right */ | |||
for (k = 0; k < SHINGLES_WINDOW - 1; k ++) { | |||
res[j * SHINGLES_WINDOW + k] = | |||
res[j * SHINGLES_WINDOW + k + 1]; | |||
} | |||
word = &g_array_index (input, rspamd_ftok_t, beg); | |||
/* Insert the last element to the pipe */ | |||
res[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] = | |||
rspamd_cryptobox_fast_hash_specific (ht, | |||
word->begin, word->len, | |||
*(guint64 *)keys[j]); | |||
val = 0; | |||
for (k = 0; k < SHINGLES_WINDOW; k ++) { | |||
val ^= res[j * SHINGLES_WINDOW + k]; | |||
} | |||
g_array_append_val (hashes[j], val); | |||
} | |||
beg++; | |||
} | |||
} | |||
} | |||
@@ -25,6 +25,12 @@ struct rspamd_shingle { | |||
guint64 hashes[RSPAMD_SHINGLE_SIZE]; | |||
}; | |||
enum rspamd_shingle_alg { | |||
RSPAMD_SHINGLES_OLD = 0, | |||
RSPAMD_SHINGLES_XXHASH, | |||
RSPAMD_SHINGLES_MUMHASH, | |||
}; | |||
/** | |||
* Shingles filtering function | |||
* @param input input array of hashes | |||
@@ -48,7 +54,8 @@ struct rspamd_shingle* rspamd_shingles_generate (GArray *input, | |||
const guchar key[16], | |||
rspamd_mempool_t *pool, | |||
rspamd_shingles_filter filter, | |||
gpointer filterd); | |||
gpointer filterd, | |||
enum rspamd_shingle_alg alg); | |||
/** | |||
* Compares two shingles and return result as a floating point value - 1.0 |
@@ -1068,7 +1068,8 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule, | |||
rule->shingles_key->str); | |||
sh = rspamd_shingles_generate (words, | |||
rule->shingles_key->str, pool, | |||
rspamd_shingles_default_filter, NULL); | |||
rspamd_shingles_default_filter, NULL, | |||
RSPAMD_SHINGLES_OLD); | |||
if (sh != NULL) { | |||
memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl)); | |||
shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE; |
@@ -81,7 +81,8 @@ free_fuzzy_words (GArray *ar) | |||
} | |||
static void | |||
test_case (gsize cnt, gsize max_len, gdouble perm_factor) | |||
test_case (gsize cnt, gsize max_len, gdouble perm_factor, | |||
enum rspamd_shingle_alg alg) | |||
{ | |||
GArray *input; | |||
struct rspamd_shingle *sgl, *sgl_permuted; | |||
@@ -93,17 +94,18 @@ test_case (gsize cnt, gsize max_len, gdouble perm_factor) | |||
input = generate_fuzzy_words (cnt, max_len); | |||
ts1 = rspamd_get_ticks (); | |||
sgl = rspamd_shingles_generate (input, key, NULL, | |||
rspamd_shingles_default_filter, NULL); | |||
rspamd_shingles_default_filter, NULL, alg); | |||
ts2 = rspamd_get_ticks (); | |||
permute_vector (input, perm_factor); | |||
sgl_permuted = rspamd_shingles_generate (input, key, NULL, | |||
rspamd_shingles_default_filter, NULL); | |||
rspamd_shingles_default_filter, NULL, alg); | |||
res = rspamd_shingles_compare (sgl, sgl_permuted); | |||
msg_debug ("percentage of common shingles: %.3f, generate time: %hd usec", | |||
res, (gint)(ts1 - ts2) * 1000); | |||
g_assert_cmpfloat (fabs ((1.0 - res) - sqrt (perm_factor)), <=, 0.20); | |||
msg_info ("%d (%z words of %z max len, %.2f perm factor):" | |||
" percentage of common shingles: %.3f, generate time: %.4f sec", | |||
alg, cnt, max_len, perm_factor, res, ts2 - ts1); | |||
g_assert_cmpfloat (fabs ((1.0 - res) - sqrt (perm_factor)), <=, 0.25); | |||
free_fuzzy_words (input); | |||
g_free (sgl); | |||
@@ -113,10 +115,15 @@ test_case (gsize cnt, gsize max_len, gdouble perm_factor) | |||
void | |||
rspamd_shingles_test_func (void) | |||
{ | |||
//test_case (5, 100, 0.5); | |||
test_case (200, 10, 0.1); | |||
test_case (500, 20, 0.01); | |||
test_case (5000, 20, 0.01); | |||
test_case (5000, 15, 0); | |||
test_case (5000, 30, 1.0); | |||
enum rspamd_shingle_alg alg = RSPAMD_SHINGLES_OLD; | |||
for (alg = RSPAMD_SHINGLES_OLD; alg <= RSPAMD_SHINGLES_MUMHASH; alg ++) { | |||
test_case (200, 10, 0.1, alg); | |||
test_case (500, 20, 0.01, alg); | |||
test_case (5000, 20, 0.01, alg); | |||
test_case (5000, 15, 0, alg); | |||
test_case (5000, 30, 1.0, alg); | |||
test_case (50000, 30, 0.02, alg); | |||
test_case (50000, 5, 0.02, alg); | |||
} | |||
} |