]> source.dussan.org Git - rspamd.git/commitdiff
Add preliminary shingles version.
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 17 Dec 2014 16:44:52 +0000 (16:44 +0000)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 17 Dec 2014 16:44:52 +0000 (16:44 +0000)
contrib/siphash
src/libutil/CMakeLists.txt
src/libutil/shingles.c [new file with mode: 0644]
src/libutil/shingles.h [new file with mode: 0644]

index d8a25fc9442b069459d542edfa2e241ec49b6c82..17e54301cb629d5711af0f85eec0b760efce4b52 160000 (submodule)
@@ -1 +1 @@
-Subproject commit d8a25fc9442b069459d542edfa2e241ec49b6c82
+Subproject commit 17e54301cb629d5711af0f85eec0b760efce4b52
index d7894ea7cce16b86d7962ba3c444fbceee0e8191..03c6e81aab35b4ea54b3f66f281ee187a9363660 100644 (file)
@@ -14,6 +14,7 @@ SET(LIBRSPAMDUTILSRC
                                                                printf.c
                                                                radix.c
                                                                rrd.c
+                                                               shingles.c
                                                                trie.c
                                                                upstream.c
                                                                util.c)
diff --git a/src/libutil/shingles.c b/src/libutil/shingles.c
new file mode 100644 (file)
index 0000000..287e587
--- /dev/null
@@ -0,0 +1,131 @@
+/* Copyright (c) 2014, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *       * Redistributions of source code must retain the above copyright
+ *         notice, this list of conditions and the following disclaimer.
+ *       * Redistributions in binary form must reproduce the above copyright
+ *         notice, this list of conditions and the following disclaimer in the
+ *         documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "shingles.h"
+#include "fstring.h"
+#include "siphash.h"
+
+#define SHINGLES_WINDOW 10
+
+/*
+ * Feed one input word into every siphash state of the row, so that each of
+ * the RSPAMD_SHINGLE_SIZE differently keyed hashes absorbs the same bytes.
+ */
+static void
+rspamd_shingles_update_row (rspamd_fstring_t *in, struct siphash *h)
+{
+       struct siphash *state = h;
+       struct siphash *const last = h + RSPAMD_SHINGLE_SIZE;
+
+       while (state != last) {
+               sip24_update (state, in->begin, in->len);
+               state ++;
+       }
+}
+
+/*
+ * Build a shingle (RSPAMD_SHINGLE_SIZE 64-bit values) for a sequence of words.
+ *
+ * RSPAMD_SHINGLE_SIZE siphash keys are derived from `key` by chained sha256.
+ * A window then slides over `input`; for every window position each keyed
+ * hash is computed over the words inside the window and stored in a per-key
+ * vector. Finally `filter` reduces every vector to a single value.
+ *
+ * input   - array of `rspamd_fstring_t` words
+ * key     - 16 bytes of secret used to derive the siphash keys
+ * pool    - memory pool used to allocate the result
+ * filter  - reduces a vector of hashes to one value; it is called
+ *           unconditionally, so it must not be NULL
+ * filterd - opaque pointer handed to `filter` unchanged
+ *
+ * Returns the shingle allocated from `pool`. For an empty `input` the filter
+ * is invoked with `count == 0` for every key.
+ */
+struct rspamd_shingle*
+rspamd_shingles_generate (GArray *input,
+               const guchar key[16],
+               rspamd_mempool_t *pool,
+               rspamd_shingles_filter filter,
+               gpointer filterd)
+{
+       struct rspamd_shingle *res;
+       GArray *hashes[RSPAMD_SHINGLE_SIZE];
+       struct sipkey keys[RSPAMD_SHINGLE_SIZE];
+       struct siphash h[RSPAMD_SHINGLE_SIZE];
+       guchar shabuf[32], *out_key;
+       const guchar *cur_key;
+       GChecksum *cksum;
+       gint i, j, beg = 0;
+       gsize shalen;
+       guint64 val;
+
+       res = rspamd_mempool_alloc (pool, sizeof (*res));
+       cksum = g_checksum_new (G_CHECKSUM_SHA256);
+       cur_key = key;
+       /*
+        * Keys are written as raw bytes, 16 per key.
+        * NOTE(review): this assumes sizeof (struct sipkey) == 16 - confirm
+        * against siphash.h.
+        */
+       out_key = (guchar *)&keys[0];
+
+       /* Init hashes pipes and keys */
+       for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
+               hashes[i] = g_array_sized_new (FALSE, FALSE, sizeof (guint64),
+                               SHINGLES_WINDOW * 2);
+               /*
+                * To generate a set of keys we apply sha256 to the previous
+                * key (the initial key for the first round) and fold the 32
+                * bytes digest into a 16 bytes SIP key by xoring byte `j` with
+                * its mirror byte `31 - j`.
+                */
+               shalen = sizeof (shabuf);
+               g_checksum_update (cksum, cur_key, 16);
+               g_checksum_get_digest (cksum, shabuf, &shalen);
+
+               for (j = 0; j < 16; j ++) {
+                       out_key[j] = shabuf[j] ^ shabuf[sizeof(shabuf) - j - 1];
+               }
+               /* The checksum is closed by get_digest, reset it for reuse */
+               g_checksum_reset (cksum);
+               cur_key = out_key;
+               out_key += 16;
+               sip24_init (&h[i], &keys[i]);
+       }
+
+       g_checksum_free (cksum);
+
+       /* Now parse input words into a vector of hashes using rolling window */
+       for (i = 0; i < (gint)input->len; i ++) {
+               /*
+                * Emit a row when the window [beg, i] is full or when the last
+                * word is reached (so short inputs still produce one row).
+                */
+               if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len - 1) {
+                       for (j = beg; j <= i; j ++) {
+                               rspamd_shingles_update_row (&g_array_index (input,
+                                               rspamd_fstring_t, j), h);
+                       }
+
+                       /* Slide the window so the next row starts one word later */
+                       beg ++;
+
+                       /*
+                        * Finish the row: store one value per key. The index
+                        * here must be the key index `j`, not the word index `i`.
+                        */
+                       for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
+                               val = sip24_final (&h[j]);
+                               /* Reinit siphash state for the next window */
+                               sip24_init (&h[j], &keys[j]);
+                               g_array_append_val (hashes[j], val);
+                       }
+               }
+       }
+
+       /* Now we need to filter all hashes and make a shingles result */
+       for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
+               res->hashes[i] = filter ((guint64 *)hashes[i]->data, hashes[i]->len,
+                               filterd);
+               g_array_free (hashes[i], TRUE);
+       }
+
+       return res;
+}
+
+
+/*
+ * Default shingles filter: pick the smallest hash of the vector.
+ * Returns G_MAXUINT64 when `count` is zero; `ud` is not used.
+ */
+guint64
+rspamd_shingles_default_filter (guint64 *input, gsize count,
+               gpointer ud)
+{
+       guint64 lowest = G_MAXUINT64;
+       gsize left = count;
+
+       /* The minimum does not depend on the order, so walk from the tail */
+       while (left > 0) {
+               left --;
+               if (input[left] < lowest) {
+                       lowest = input[left];
+               }
+       }
+
+       return lowest;
+}
diff --git a/src/libutil/shingles.h b/src/libutil/shingles.h
new file mode 100644 (file)
index 0000000..5711999
--- /dev/null
@@ -0,0 +1,66 @@
+/* Copyright (c) 2014, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *       * Redistributions of source code must retain the above copyright
+ *         notice, this list of conditions and the following disclaimer.
+ *       * Redistributions in binary form must reproduce the above copyright
+ *         notice, this list of conditions and the following disclaimer in the
+ *         documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SHINGLES_H_
+#define SHINGLES_H_
+
+#include "config.h"
+#include "mem_pool.h"
+
+#define RSPAMD_SHINGLE_SIZE 84
+
+struct rspamd_shingle {
+       guint64 hashes[RSPAMD_SHINGLE_SIZE]; /* one value per hash function, as selected by the filter */
+};
+
+/**
+ * Shingles filtering function: reduces a vector of hashes to a single value
+ * @param input input array of hashes
+ * @param count number of hashes in the vector (may be zero)
+ * @param ud opaque user data, passed unchanged from rspamd_shingles_generate
+ * @return shingle value
+ */
+typedef guint64 (*rspamd_shingles_filter) (guint64 *input, gsize count,
+               gpointer ud);
+
+/**
+ * Generate shingles from the input of fixed size strings using lemmatizer
+ * if needed
+ * @param input array of `rspamd_fstring_t`
+ * @param key secret key (16 bytes) used to generate shingles
+ * @param pool pool to allocate shingles array
+ * @param filter hashes filtering function, called once per hash function
+ * @param filterd opaque data for filtering function
+ * @return shingles array allocated from `pool`
+ */
+struct rspamd_shingle* rspamd_shingles_generate (GArray *input,
+               const guchar key[16],
+               rspamd_mempool_t *pool,
+               rspamd_shingles_filter filter,
+               gpointer filterd);
+
+/**
+ * Default filtering function: selects the minimal hash of the vector
+ * @param input input array of hashes
+ * @param count number of hashes in the vector
+ * @param ud unused
+ * @return the smallest element of `input` or G_MAXUINT64 if `count` is zero
+ */
+guint64 rspamd_shingles_default_filter (guint64 *input, gsize count,
+               gpointer ud);
+
+#endif /* SHINGLES_H_ */