const guchar key[16],
rspamd_mempool_t *pool,
rspamd_shingles_filter filter,
- gpointer filterd)
+ gpointer filterd,
+ enum rspamd_shingle_alg alg)
{
struct rspamd_shingle *res;
GArray *hashes[RSPAMD_SHINGLE_SIZE];
rspamd_ftok_t *word;
rspamd_cryptobox_hash_state_t bs;
guint64 val;
- gint i, j, beg = 0;
+ gint i, j, k, beg = 0;
+ enum rspamd_cryptobox_fast_hash_type ht;
if (pool != NULL) {
res = rspamd_mempool_alloc (pool, sizeof (*res));
}
/* Now parse input words into a vector of hashes using rolling window */
- for (i = 0; i <= (gint)input->len; i ++) {
- if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {
- for (j = beg; j < i; j ++) {
- word = &g_array_index (input, rspamd_ftok_t, j);
- row = rspamd_fstring_append (row, word->begin, word->len);
+ if (alg == RSPAMD_SHINGLES_OLD) {
+ for (i = 0; i <= (gint)input->len; i ++) {
+ if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {
+ for (j = beg; j < i; j ++) {
+ word = &g_array_index (input, rspamd_ftok_t, j);
+ row = rspamd_fstring_append (row, word->begin, word->len);
+ }
+ beg++;
+
+ /* Now we need to create a new row here */
+ for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
+ rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len,
+ keys[j]);
+ g_array_append_val (hashes[j], val);
+ }
+
+ row = rspamd_fstring_assign (row, "", 0);
}
- beg++;
+ }
+ }
+ else {
+ guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE];
- /* Now we need to create a new row here */
- for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
- rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len,
- keys[j]);
- g_array_append_val (hashes[j], val);
- }
+ if (alg == RSPAMD_SHINGLES_XXHASH) {
+ ht = RSPAMD_CRYPTOBOX_XXHASH64;
+ }
+ else {
+ ht = RSPAMD_CRYPTOBOX_MUMHASH;
+ }
- row = rspamd_fstring_assign (row, "", 0);
+ memset (res, 0, sizeof (res));
+
+ for (i = 0; i <= (gint)input->len; i ++) {
+ if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {
+
+ for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
+ /* Shift hashes window to right */
+ for (k = 0; k < SHINGLES_WINDOW - 1; k ++) {
+ res[j * SHINGLES_WINDOW + k] =
+ res[j * SHINGLES_WINDOW + k + 1];
+ }
+
+ word = &g_array_index (input, rspamd_ftok_t, beg);
+ /* Insert the last element to the pipe */
+ res[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] =
+ rspamd_cryptobox_fast_hash_specific (ht,
+ word->begin, word->len,
+ *(guint64 *)keys[j]);
+
+ val = 0;
+ for (k = 0; k < SHINGLES_WINDOW; k ++) {
+ val ^= res[j * SHINGLES_WINDOW + k];
+ }
+
+ g_array_append_val (hashes[j], val);
+ }
+ beg++;
+ }
}
}
guint64 hashes[RSPAMD_SHINGLE_SIZE];
};
+enum rspamd_shingle_alg { /* hashing scheme selector, passed as the last argument of rspamd_shingles_generate */
+ RSPAMD_SHINGLES_OLD = 0, /* siphash over the concatenated words of each window */
+ RSPAMD_SHINGLES_XXHASH, /* per-word fast hash using RSPAMD_CRYPTOBOX_XXHASH64 */
+ RSPAMD_SHINGLES_MUMHASH, /* per-word fast hash using RSPAMD_CRYPTOBOX_MUMHASH */
+};
+
/**
* Shingles filtering function
* @param input input array of hashes
const guchar key[16],
rspamd_mempool_t *pool,
rspamd_shingles_filter filter,
- gpointer filterd);
+ gpointer filterd,
+ enum rspamd_shingle_alg alg);
/**
* Compares two shingles and return result as a floating point value - 1.0
rule->shingles_key->str);
sh = rspamd_shingles_generate (words,
rule->shingles_key->str, pool,
- rspamd_shingles_default_filter, NULL);
+ rspamd_shingles_default_filter, NULL,
+ RSPAMD_SHINGLES_OLD);
if (sh != NULL) {
memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl));
shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;
}
static void
-test_case (gsize cnt, gsize max_len, gdouble perm_factor)
+test_case (gsize cnt, gsize max_len, gdouble perm_factor,
+ enum rspamd_shingle_alg alg)
{
GArray *input;
struct rspamd_shingle *sgl, *sgl_permuted;
input = generate_fuzzy_words (cnt, max_len);
ts1 = rspamd_get_ticks ();
sgl = rspamd_shingles_generate (input, key, NULL,
- rspamd_shingles_default_filter, NULL);
+ rspamd_shingles_default_filter, NULL, alg);
ts2 = rspamd_get_ticks ();
permute_vector (input, perm_factor);
sgl_permuted = rspamd_shingles_generate (input, key, NULL,
- rspamd_shingles_default_filter, NULL);
+ rspamd_shingles_default_filter, NULL, alg);
res = rspamd_shingles_compare (sgl, sgl_permuted);
- msg_debug ("percentage of common shingles: %.3f, generate time: %hd usec",
- res, (gint)(ts1 - ts2) * 1000);
- g_assert_cmpfloat (fabs ((1.0 - res) - sqrt (perm_factor)), <=, 0.20);
+ msg_info ("%d (%z words of %z max len, %.2f perm factor):"
+ " percentage of common shingles: %.3f, generate time: %.4f sec",
+ alg, cnt, max_len, perm_factor, res, ts2 - ts1);
+ g_assert_cmpfloat (fabs ((1.0 - res) - sqrt (perm_factor)), <=, 0.25);
free_fuzzy_words (input);
g_free (sgl);
void
rspamd_shingles_test_func (void)
{
- //test_case (5, 100, 0.5);
- test_case (200, 10, 0.1);
- test_case (500, 20, 0.01);
- test_case (5000, 20, 0.01);
- test_case (5000, 15, 0);
- test_case (5000, 30, 1.0);
+ enum rspamd_shingle_alg alg = RSPAMD_SHINGLES_OLD; /* algorithm under test; advanced by the loop below */
+
+ for (alg = RSPAMD_SHINGLES_OLD; alg <= RSPAMD_SHINGLES_MUMHASH; alg ++) { /* run the same suite once per enum value, OLD through MUMHASH */
+ test_case (200, 10, 0.1, alg); /* args: word count, max word length, permutation factor, algorithm */
+ test_case (500, 20, 0.01, alg);
+ test_case (5000, 20, 0.01, alg);
+ test_case (5000, 15, 0, alg); /* zero permutation factor */
+ test_case (5000, 30, 1.0, alg); /* permutation factor of 1.0 */
+ test_case (50000, 30, 0.02, alg); /* larger inputs: 50000 words */
+ test_case (50000, 5, 0.02, alg);
+ }
}