From: Vsevolod Stakhov Date: Fri, 19 Dec 2014 12:15:34 +0000 (+0000) Subject: Several improvements to the shingles calculations. X-Git-Tag: 0.8.0~69 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=123886d49937ccefc547079f57ca4ef2f11bc312;p=rspamd.git Several improvements to the shingles calculations. --- diff --git a/src/libutil/shingles.c b/src/libutil/shingles.c index 9421c1529..fa49fdadd 100644 --- a/src/libutil/shingles.c +++ b/src/libutil/shingles.c @@ -63,6 +63,7 @@ rspamd_shingles_generate (GArray *input, } blake2b_init (&bs, BLAKE2B_OUTBYTES); + memset (h, 0, sizeof (h)); cur_key = key; out_key = (guchar *)&keys[0]; @@ -85,14 +86,13 @@ rspamd_shingles_generate (GArray *input, blake2b_init (&bs, BLAKE2B_OUTBYTES); cur_key = out_key; out_key += 16; - memset (&h[i], 0, sizeof (h[0])); sip24_init (&h[i], &keys[i]); } /* Now parse input words into a vector of hashes using rolling window */ - for (i = 0; i < (gint)input->len; i ++) { - if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len - 1) { - for (j = beg; j <= i; j ++) { + for (i = 0; i <= (gint)input->len; i ++) { + if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) { + for (j = beg; j < i; j ++) { rspamd_shingles_update_row (&g_array_index (input, rspamd_fstring_t, j), h); } @@ -114,7 +114,7 @@ rspamd_shingles_generate (GArray *input, /* Now we need to filter all hashes and make a shingles result */ for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) { res->hashes[i] = filter ((guint64 *)hashes[i]->data, hashes[i]->len, - filterd); + i, key, filterd); g_array_free (hashes[i], TRUE); } @@ -124,7 +124,7 @@ rspamd_shingles_generate (GArray *input, guint64 rspamd_shingles_default_filter (guint64 *input, gsize count, - gpointer ud) + gint shno, const guchar *key, gpointer ud) { guint64 minimal = G_MAXUINT64; gsize i; diff --git a/src/libutil/shingles.h b/src/libutil/shingles.h index 61b3b24c3..d9025946e 100644 --- a/src/libutil/shingles.h +++ b/src/libutil/shingles.h @@ -26,7 +26,7 @@ #include "config.h" #include "mem_pool.h" -#define RSPAMD_SHINGLE_SIZE 23 +#define RSPAMD_SHINGLE_SIZE 32 struct rspamd_shingle { guint64 hashes[RSPAMD_SHINGLE_SIZE]; @@ -39,7 +39,7 @@ struct rspamd_shingle { * @return shingle value */ typedef guint64 (*rspamd_shingles_filter) (guint64 *input, gsize count, - gpointer ud); + gint shno, const guchar *key, gpointer ud); /** * Generate shingles from the input of fixed size strings using lemmatizer @@ -71,6 +71,6 @@ gdouble rspamd_shingles_compare (const struct rspamd_shingle *a, * Default filtering function */ guint64 rspamd_shingles_default_filter (guint64 *input, gsize count, - gpointer ud); + gint shno, const guchar *key, gpointer ud); #endif /* SHINGLES_H_ */