aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2014-12-17 16:44:52 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2014-12-17 16:44:52 +0000
commit7eb5a5745b0df7a5b05dd6beee9a41e60cefa7e7 (patch)
tree4a1134991746cafcb01efb3046b32abf59ceb369
parent4178270ca52b46428b0aa9dc7fcf0fd7f03d8c7a (diff)
downloadrspamd-7eb5a5745b0df7a5b05dd6beee9a41e60cefa7e7.tar.gz
rspamd-7eb5a5745b0df7a5b05dd6beee9a41e60cefa7e7.zip
Add preliminary shingles version.
m---------contrib/siphash0
-rw-r--r--src/libutil/CMakeLists.txt1
-rw-r--r--src/libutil/shingles.c131
-rw-r--r--src/libutil/shingles.h66
4 files changed, 198 insertions, 0 deletions
diff --git a/contrib/siphash b/contrib/siphash
-Subproject d8a25fc9442b069459d542edfa2e241ec49b6c8
+Subproject 17e54301cb629d5711af0f85eec0b760efce4b5
diff --git a/src/libutil/CMakeLists.txt b/src/libutil/CMakeLists.txt
index d7894ea7c..03c6e81aa 100644
--- a/src/libutil/CMakeLists.txt
+++ b/src/libutil/CMakeLists.txt
@@ -14,6 +14,7 @@ SET(LIBRSPAMDUTILSRC
printf.c
radix.c
rrd.c
+ shingles.c
trie.c
upstream.c
util.c)
diff --git a/src/libutil/shingles.c b/src/libutil/shingles.c
new file mode 100644
index 000000000..287e5875d
--- /dev/null
+++ b/src/libutil/shingles.c
@@ -0,0 +1,131 @@
+/* Copyright (c) 2014, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "shingles.h"
+#include "fstring.h"
+#include "siphash.h"
+
+#define SHINGLES_WINDOW 10
+
+static void
+rspamd_shingles_update_row (rspamd_fstring_t *in, struct siphash *h)
+{
+ int i;
+
+ for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
+ sip24_update (&h[i], in->begin, in->len);
+ }
+}
+
+struct rspamd_shingle*
+rspamd_shingles_generate (GArray *input,
+ const guchar key[16],
+ rspamd_mempool_t *pool,
+ rspamd_shingles_filter filter,
+ gpointer filterd)
+{
+ struct rspamd_shingle *res;
+ GArray *hashes[RSPAMD_SHINGLE_SIZE];
+ struct sipkey keys[RSPAMD_SHINGLE_SIZE];
+ struct siphash h[RSPAMD_SHINGLE_SIZE];
+ guchar shabuf[32], *out_key;
+ const guchar *cur_key;
+ GChecksum *cksum;
+ gint i, j, beg = 0;
+ gsize shalen;
+
+ res = rspamd_mempool_alloc (pool, sizeof (*res));
+ cksum = g_checksum_new (G_CHECKSUM_SHA256);
+ cur_key = key;
+ out_key = (guchar *)&keys[0];
+
+ /* Init hashes pipes and keys */
+ for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
+ hashes[i] = g_array_sized_new (FALSE, FALSE, sizeof (guint64),
+ SHINGLES_WINDOW * 2);
+ /*
+ * To generate a set of hashes we just apply sha256 to the
+ * initial key as many times as many hashes are required and
+ * xor left and right parts of sha256 to get a single 16 bytes SIP key.
+ */
+ shalen = sizeof (shabuf);
+ g_checksum_update (cksum, cur_key, 16);
+ g_checksum_get_digest (cksum, shabuf, &shalen);
+
+ for (j = 0; j < 16; j ++) {
+ out_key[j] = shabuf[j] ^ shabuf[sizeof(shabuf) - j - 1];
+ }
+ g_checksum_reset (cksum);
+ cur_key = out_key;
+ out_key += 16;
+ sip24_init (&h[i], &keys[i]);
+ }
+
+ g_checksum_free (cksum);
+
+ /* Now parse input words into a vector of hashes using rolling window */
+ for (i = 0; i < (gint)input->len; i ++) {
+ if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len - 1) {
+ for (j = beg; j <= i; j ++) {
+ rspamd_shingles_update_row (&g_array_index (input,
+ rspamd_fstring_t, j), h);
+ }
+
+ /* Now we need to create a new row here */
+ for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
+ guint64 val;
+
+ val = sip24_final (&h[i]);
+ /* Reinit siphash state */
+ sip24_init (&h[i], &keys[i]);
+ g_array_append_val (hashes[i], val);
+ }
+ }
+ }
+
+ /* Now we need to filter all hashes and make a shingles result */
+ for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
+ res->hashes[i] = filter ((guint64 *)hashes[i]->data, hashes[i]->len,
+ filterd);
+ g_array_free (hashes[i], TRUE);
+ }
+
+ return res;
+}
+
+
+guint64
+rspamd_shingles_default_filter (guint64 *input, gsize count,
+ gpointer ud)
+{
+ guint64 minimal = G_MAXUINT64;
+ gsize i;
+
+ for (i = 0; i < count; i ++) {
+ if (minimal > input[i]) {
+ minimal = input[i];
+ }
+ }
+
+ return minimal;
+}
diff --git a/src/libutil/shingles.h b/src/libutil/shingles.h
new file mode 100644
index 000000000..571199992
--- /dev/null
+++ b/src/libutil/shingles.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2014, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SHINGLES_H_
+#define SHINGLES_H_
+
+#include "config.h"
+#include "mem_pool.h"
+
+#define RSPAMD_SHINGLE_SIZE 84
+
+struct rspamd_shingle {
+ guint64 hashes[RSPAMD_SHINGLE_SIZE];
+};
+
+/**
+ * Shingles filtering function
+ * @param input input array of hashes
+ * @param count number of hashes in the vector
+ * @return shingle value
+ */
+typedef guint64 (*rspamd_shingles_filter) (guint64 *input, gsize count,
+ gpointer ud);
+
+/**
+ * Generate shingles from the input of fixed size strings using lemmatizer
+ * if needed
+ * @param input array of `rspamd_fstring_t`
+ * @param key secret key used to generate shingles
+ * @param pool pool to allocate shigles array
+ * @param filter hashes filtering function
+ * @param filterd opaque data for filtering function
+ * @return shingles array
+ */
+struct rspamd_shingle* rspamd_shingles_generate (GArray *input,
+ const guchar key[16],
+ rspamd_mempool_t *pool,
+ rspamd_shingles_filter filter,
+ gpointer filterd);
+
+/**
+ * Default filtering function
+ */
+guint64 rspamd_shingles_default_filter (guint64 *input, gsize count,
+ gpointer ud);
+
+#endif /* SHINGLES_H_ */