Browse Source

[Feature] Add more algorithms for shingles generation

tags/1.3.0
Vsevolod Stakhov 8 years ago
parent
commit
4932b636ef
4 changed files with 88 additions and 29 deletions
  1. 59
    15
      src/libutil/shingles.c
  2. 8
    1
      src/libutil/shingles.h
  3. 2
    1
      src/plugins/fuzzy_check.c
  4. 19
    12
      test/rspamd_shingles_test.c

+ 59
- 15
src/libutil/shingles.c View File

@@ -24,7 +24,8 @@ rspamd_shingles_generate (GArray *input,
const guchar key[16],
rspamd_mempool_t *pool,
rspamd_shingles_filter filter,
gpointer filterd)
gpointer filterd,
enum rspamd_shingle_alg alg)
{
struct rspamd_shingle *res;
GArray *hashes[RSPAMD_SHINGLE_SIZE];
@@ -35,7 +36,8 @@ rspamd_shingles_generate (GArray *input,
rspamd_ftok_t *word;
rspamd_cryptobox_hash_state_t bs;
guint64 val;
gint i, j, beg = 0;
gint i, j, k, beg = 0;
enum rspamd_cryptobox_fast_hash_type ht;

if (pool != NULL) {
res = rspamd_mempool_alloc (pool, sizeof (*res));
@@ -71,22 +73,64 @@ rspamd_shingles_generate (GArray *input,
}

/* Now parse input words into a vector of hashes using rolling window */
for (i = 0; i <= (gint)input->len; i ++) {
if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {
for (j = beg; j < i; j ++) {
word = &g_array_index (input, rspamd_ftok_t, j);
row = rspamd_fstring_append (row, word->begin, word->len);
if (alg == RSPAMD_SHINGLES_OLD) {
for (i = 0; i <= (gint)input->len; i ++) {
if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {
for (j = beg; j < i; j ++) {
word = &g_array_index (input, rspamd_ftok_t, j);
row = rspamd_fstring_append (row, word->begin, word->len);
}
beg++;

/* Now we need to create a new row here */
for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len,
keys[j]);
g_array_append_val (hashes[j], val);
}

row = rspamd_fstring_assign (row, "", 0);
}
beg++;
}
}
else {
guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE];

/* Now we need to create a new row here */
for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len,
keys[j]);
g_array_append_val (hashes[j], val);
}
if (alg == RSPAMD_SHINGLES_XXHASH) {
ht = RSPAMD_CRYPTOBOX_XXHASH64;
}
else {
ht = RSPAMD_CRYPTOBOX_MUMHASH;
}

row = rspamd_fstring_assign (row, "", 0);
memset (res, 0, sizeof (res));

for (i = 0; i <= (gint)input->len; i ++) {
if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {

for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
/* Shift hashes window to right */
for (k = 0; k < SHINGLES_WINDOW - 1; k ++) {
res[j * SHINGLES_WINDOW + k] =
res[j * SHINGLES_WINDOW + k + 1];
}

word = &g_array_index (input, rspamd_ftok_t, beg);
/* Insert the last element to the pipe */
res[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] =
rspamd_cryptobox_fast_hash_specific (ht,
word->begin, word->len,
*(guint64 *)keys[j]);

val = 0;
for (k = 0; k < SHINGLES_WINDOW; k ++) {
val ^= res[j * SHINGLES_WINDOW + k];
}

g_array_append_val (hashes[j], val);
}
beg++;
}
}
}


+ 8
- 1
src/libutil/shingles.h View File

@@ -25,6 +25,12 @@ struct rspamd_shingle {
guint64 hashes[RSPAMD_SHINGLE_SIZE];
};

/**
 * Hashing scheme selected by the `alg` argument of rspamd_shingles_generate ()
 */
enum rspamd_shingle_alg {
	/* Siphash computed over the concatenated words of each window */
	RSPAMD_SHINGLES_OLD = 0,
	/* Per-word RSPAMD_CRYPTOBOX_XXHASH64 hashes XOR-ed across the window */
	RSPAMD_SHINGLES_XXHASH = 1,
	/* Per-word RSPAMD_CRYPTOBOX_MUMHASH hashes XOR-ed across the window */
	RSPAMD_SHINGLES_MUMHASH = 2,
};

/**
* Shingles filtering function
* @param input input array of hashes
@@ -48,7 +54,8 @@ struct rspamd_shingle* rspamd_shingles_generate (GArray *input,
const guchar key[16],
rspamd_mempool_t *pool,
rspamd_shingles_filter filter,
gpointer filterd);
gpointer filterd,
enum rspamd_shingle_alg alg);

/**
* Compares two shingles and return result as a floating point value - 1.0

+ 2
- 1
src/plugins/fuzzy_check.c View File

@@ -1068,7 +1068,8 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule,
rule->shingles_key->str);
sh = rspamd_shingles_generate (words,
rule->shingles_key->str, pool,
rspamd_shingles_default_filter, NULL);
rspamd_shingles_default_filter, NULL,
RSPAMD_SHINGLES_OLD);
if (sh != NULL) {
memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl));
shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;

+ 19
- 12
test/rspamd_shingles_test.c View File

@@ -81,7 +81,8 @@ free_fuzzy_words (GArray *ar)
}

static void
test_case (gsize cnt, gsize max_len, gdouble perm_factor)
test_case (gsize cnt, gsize max_len, gdouble perm_factor,
enum rspamd_shingle_alg alg)
{
GArray *input;
struct rspamd_shingle *sgl, *sgl_permuted;
@@ -93,17 +94,18 @@ test_case (gsize cnt, gsize max_len, gdouble perm_factor)
input = generate_fuzzy_words (cnt, max_len);
ts1 = rspamd_get_ticks ();
sgl = rspamd_shingles_generate (input, key, NULL,
rspamd_shingles_default_filter, NULL);
rspamd_shingles_default_filter, NULL, alg);
ts2 = rspamd_get_ticks ();
permute_vector (input, perm_factor);
sgl_permuted = rspamd_shingles_generate (input, key, NULL,
rspamd_shingles_default_filter, NULL);
rspamd_shingles_default_filter, NULL, alg);

res = rspamd_shingles_compare (sgl, sgl_permuted);

msg_debug ("percentage of common shingles: %.3f, generate time: %hd usec",
res, (gint)(ts1 - ts2) * 1000);
g_assert_cmpfloat (fabs ((1.0 - res) - sqrt (perm_factor)), <=, 0.20);
msg_info ("%d (%z words of %z max len, %.2f perm factor):"
" percentage of common shingles: %.3f, generate time: %.4f sec",
alg, cnt, max_len, perm_factor, res, ts2 - ts1);
g_assert_cmpfloat (fabs ((1.0 - res) - sqrt (perm_factor)), <=, 0.25);

free_fuzzy_words (input);
g_free (sgl);
@@ -113,10 +115,15 @@ test_case (gsize cnt, gsize max_len, gdouble perm_factor)
/*
 * Shingles test entry point: calls test_case () with a series of
 * (word count, max word length, permutation factor) combinations.
 *
 * NOTE(review): this hunk shows two revisions of the body interleaved. The
 * three-argument test_case () calls use the signature without `alg`; the
 * loop below them passes `alg` explicitly and repeats the suite for each
 * enum value from RSPAMD_SHINGLES_OLD through RSPAMD_SHINGLES_MUMHASH.
 */
void
rspamd_shingles_test_func (void)
{
//test_case (5, 100, 0.5);
/* Calls using the three-parameter form of test_case () (no algorithm argument) */
test_case (200, 10, 0.1);
test_case (500, 20, 0.01);
test_case (5000, 20, 0.01);
test_case (5000, 15, 0);
test_case (5000, 30, 1.0);
enum rspamd_shingle_alg alg = RSPAMD_SHINGLES_OLD;

/* Iterate over the algorithms by incrementing the enum value; this relies on
 * the enumerators between OLD and MUMHASH being contiguous */
for (alg = RSPAMD_SHINGLES_OLD; alg <= RSPAMD_SHINGLES_MUMHASH; alg ++) {
test_case (200, 10, 0.1, alg);
test_case (500, 20, 0.01, alg);
test_case (5000, 20, 0.01, alg);
/* Boundary permutation factors: 0 and 1.0 */
test_case (5000, 15, 0, alg);
test_case (5000, 30, 1.0, alg);
/* Larger inputs: 50000 words, with max word length 30 and then 5 */
test_case (50000, 30, 0.02, alg);
test_case (50000, 5, 0.02, alg);
}
}

Loading…
Cancel
Save