about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2014-12-19 12:15:34 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2014-12-19 12:15:34 +0000
commit 123886d49937ccefc547079f57ca4ef2f11bc312 (patch)
tree a124a38e3bca532cb52be6d24b7db6593428c5e6 /src
parent dd1134f06b174f1829d81b67d7c1a6aeaf8bcc1e (diff)
download rspamd-123886d49937ccefc547079f57ca4ef2f11bc312.tar.gz
rspamd-123886d49937ccefc547079f57ca4ef2f11bc312.zip
Several improvements to the shingles calculations.
Diffstat (limited to 'src')
-rw-r--r-- src/libutil/shingles.c 12
-rw-r--r-- src/libutil/shingles.h 6
2 files changed, 9 insertions, 9 deletions
diff --git a/src/libutil/shingles.c b/src/libutil/shingles.c
index 9421c1529..fa49fdadd 100644
--- a/src/libutil/shingles.c
+++ b/src/libutil/shingles.c
@@ -63,6 +63,7 @@ rspamd_shingles_generate (GArray *input,
}
blake2b_init (&bs, BLAKE2B_OUTBYTES);
+ memset (h, 0, sizeof (h));
cur_key = key;
out_key = (guchar *)&keys[0];
@@ -85,14 +86,13 @@ rspamd_shingles_generate (GArray *input,
blake2b_init (&bs, BLAKE2B_OUTBYTES);
cur_key = out_key;
out_key += 16;
- memset (&h[i], 0, sizeof (h[0]));
sip24_init (&h[i], &keys[i]);
}
/* Now parse input words into a vector of hashes using rolling window */
- for (i = 0; i < (gint)input->len; i ++) {
- if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len - 1) {
- for (j = beg; j <= i; j ++) {
+ for (i = 0; i <= (gint)input->len; i ++) {
+ if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {
+ for (j = beg; j < i; j ++) {
rspamd_shingles_update_row (&g_array_index (input,
rspamd_fstring_t, j), h);
}
@@ -114,7 +114,7 @@ rspamd_shingles_generate (GArray *input,
/* Now we need to filter all hashes and make a shingles result */
for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
res->hashes[i] = filter ((guint64 *)hashes[i]->data, hashes[i]->len,
- filterd);
+ i, key, filterd);
g_array_free (hashes[i], TRUE);
}
@@ -124,7 +124,7 @@ rspamd_shingles_generate (GArray *input,
guint64
rspamd_shingles_default_filter (guint64 *input, gsize count,
- gpointer ud)
+ gint shno, const guchar *key, gpointer ud)
{
guint64 minimal = G_MAXUINT64;
gsize i;
diff --git a/src/libutil/shingles.h b/src/libutil/shingles.h
index 61b3b24c3..d9025946e 100644
--- a/src/libutil/shingles.h
+++ b/src/libutil/shingles.h
@@ -26,7 +26,7 @@
#include "config.h"
#include "mem_pool.h"
-#define RSPAMD_SHINGLE_SIZE 23
+#define RSPAMD_SHINGLE_SIZE 32
struct rspamd_shingle {
guint64 hashes[RSPAMD_SHINGLE_SIZE];
@@ -39,7 +39,7 @@ struct rspamd_shingle {
* @return shingle value
*/
typedef guint64 (*rspamd_shingles_filter) (guint64 *input, gsize count,
- gpointer ud);
+ gint shno, const guchar *key, gpointer ud);
/**
* Generate shingles from the input of fixed size strings using lemmatizer
@@ -71,6 +71,6 @@ gdouble rspamd_shingles_compare (const struct rspamd_shingle *a,
* Default filtering function
*/
guint64 rspamd_shingles_default_filter (guint64 *input, gsize count,
- gpointer ud);
+ gint shno, const guchar *key, gpointer ud);
#endif /* SHINGLES_H_ */