From: Vsevolod Stakhov Date: Wed, 17 Dec 2014 16:44:52 +0000 (+0000) Subject: Add preliminary shingles version. X-Git-Tag: 0.8.0~78^2~7 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=7eb5a5745b0df7a5b05dd6beee9a41e60cefa7e7;p=rspamd.git Add preliminary shingles version. --- diff --git a/contrib/siphash b/contrib/siphash index d8a25fc94..17e54301c 160000 --- a/contrib/siphash +++ b/contrib/siphash @@ -1 +1 @@ -Subproject commit d8a25fc9442b069459d542edfa2e241ec49b6c82 +Subproject commit 17e54301cb629d5711af0f85eec0b760efce4b52 diff --git a/src/libutil/CMakeLists.txt b/src/libutil/CMakeLists.txt index d7894ea7c..03c6e81aa 100644 --- a/src/libutil/CMakeLists.txt +++ b/src/libutil/CMakeLists.txt @@ -14,6 +14,7 @@ SET(LIBRSPAMDUTILSRC printf.c radix.c rrd.c + shingles.c trie.c upstream.c util.c) diff --git a/src/libutil/shingles.c b/src/libutil/shingles.c new file mode 100644 index 000000000..287e5875d --- /dev/null +++ b/src/libutil/shingles.c @@ -0,0 +1,131 @@ +/* Copyright (c) 2014, Vsevolod Stakhov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "shingles.h" +#include "fstring.h" +#include "siphash.h" + +#define SHINGLES_WINDOW 10 + +static void +rspamd_shingles_update_row (rspamd_fstring_t *in, struct siphash *h) +{ + int i; + + for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) { + sip24_update (&h[i], in->begin, in->len); + } +} + +struct rspamd_shingle* +rspamd_shingles_generate (GArray *input, + const guchar key[16], + rspamd_mempool_t *pool, + rspamd_shingles_filter filter, + gpointer filterd) +{ + struct rspamd_shingle *res; + GArray *hashes[RSPAMD_SHINGLE_SIZE]; + struct sipkey keys[RSPAMD_SHINGLE_SIZE]; + struct siphash h[RSPAMD_SHINGLE_SIZE]; + guchar shabuf[32], *out_key; + const guchar *cur_key; + GChecksum *cksum; + gint i, j, beg = 0; + gsize shalen; + + res = rspamd_mempool_alloc (pool, sizeof (*res)); + cksum = g_checksum_new (G_CHECKSUM_SHA256); + cur_key = key; + out_key = (guchar *)&keys[0]; + + /* Init hashes pipes and keys */ + for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) { + hashes[i] = g_array_sized_new (FALSE, FALSE, sizeof (guint64), + SHINGLES_WINDOW * 2); + /* + * To generate a set of hashes we just apply sha256 to the + * initial key as many times as many hashes are required and + * xor left and right parts of sha256 to get a single 16 bytes SIP key. + */ + shalen = sizeof (shabuf); + g_checksum_update (cksum, cur_key, 16); + g_checksum_get_digest (cksum, shabuf, &shalen); + + for (j = 0; j < 16; j ++) { + out_key[j] = shabuf[j] ^ shabuf[sizeof(shabuf) - j - 1]; + } + g_checksum_reset (cksum); + cur_key = out_key; + out_key += 16; + sip24_init (&h[i], &keys[i]); + } + + g_checksum_free (cksum); + + /* Now parse input words into a vector of hashes using rolling window */ + for (i = 0; i < (gint)input->len; i ++) { + if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len - 1) { + for (j = beg; j <= i; j ++) { + rspamd_shingles_update_row (&g_array_index (input, + rspamd_fstring_t, j), h); + } + + /* Now we need to create a new row here */ + for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) { + guint64 val; + + val = sip24_final (&h[i]); + /* Reinit siphash state */ + sip24_init (&h[i], &keys[i]); + g_array_append_val (hashes[i], val); + } + } + } + + /* Now we need to filter all hashes and make a shingles result */ + for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) { + res->hashes[i] = filter ((guint64 *)hashes[i]->data, hashes[i]->len, + filterd); + g_array_free (hashes[i], TRUE); + } + + return res; +} + + +guint64 +rspamd_shingles_default_filter (guint64 *input, gsize count, + gpointer ud) +{ + guint64 minimal = G_MAXUINT64; + gsize i; + + for (i = 0; i < count; i ++) { + if (minimal > input[i]) { + minimal = input[i]; + } + } + + return minimal; +} diff --git a/src/libutil/shingles.h b/src/libutil/shingles.h new file mode 100644 index 000000000..571199992 --- /dev/null +++ b/src/libutil/shingles.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2014, Vsevolod Stakhov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SHINGLES_H_ +#define SHINGLES_H_ + +#include "config.h" +#include "mem_pool.h" + +#define RSPAMD_SHINGLE_SIZE 84 + +struct rspamd_shingle { + guint64 hashes[RSPAMD_SHINGLE_SIZE]; +}; + +/** + * Shingles filtering function + * @param input input array of hashes + * @param count number of hashes in the vector + * @return shingle value + */ +typedef guint64 (*rspamd_shingles_filter) (guint64 *input, gsize count, + gpointer ud); + +/** + * Generate shingles from the input of fixed size strings using lemmatizer + * if needed + * @param input array of `rspamd_fstring_t` + * @param key secret key used to generate shingles + * @param pool pool to allocate shigles array + * @param filter hashes filtering function + * @param filterd opaque data for filtering function + * @return shingles array + */ +struct rspamd_shingle* rspamd_shingles_generate (GArray *input, + const guchar key[16], + rspamd_mempool_t *pool, + rspamd_shingles_filter filter, + gpointer filterd); + +/** + * Default filtering function + */ +guint64 rspamd_shingles_default_filter (guint64 *input, gsize count, + gpointer ud); + +#endif /* SHINGLES_H_ */