diff options
-rw-r--r-- | src/fuzzy_storage.c | 51 | ||||
-rw-r--r-- | src/fuzzy_storage.h | 9 | ||||
-rw-r--r-- | src/libmime/message.h | 4 | ||||
-rw-r--r-- | src/libmime/mime_expressions.c | 1 | ||||
-rw-r--r-- | src/libserver/fuzzy_backend.c | 100 | ||||
-rw-r--r-- | src/libserver/protocol.c | 13 | ||||
-rw-r--r-- | src/libutil/CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/libutil/fuzzy.c | 557 | ||||
-rw-r--r-- | src/libutil/fuzzy.h | 77 | ||||
-rw-r--r-- | src/lua/lua_mimepart.c | 90 | ||||
-rw-r--r-- | src/plugins/fuzzy_check.c | 57 | ||||
-rw-r--r-- | test/CMakeLists.txt | 1 | ||||
-rw-r--r-- | test/rspamd_fuzzy_test.c | 76 | ||||
-rw-r--r-- | test/rspamd_test_suite.c | 1 | ||||
-rw-r--r-- | test/tests.h | 3 |
15 files changed, 29 insertions, 1012 deletions
diff --git a/src/fuzzy_storage.c b/src/fuzzy_storage.c index f544c4090..507c99349 100644 --- a/src/fuzzy_storage.c +++ b/src/fuzzy_storage.c @@ -34,7 +34,6 @@ #include "cfg_file.h" #include "url.h" #include "message.h" -#include "fuzzy.h" #include "bloom.h" #include "map.h" #include "fuzzy_storage.h" @@ -80,19 +79,11 @@ struct rspamd_fuzzy_storage_ctx { struct rspamd_fuzzy_backend *backend; }; -struct rspamd_legacy_fuzzy_node { - gint32 value; - gint32 flag; - guint64 time; - rspamd_fuzzy_t h; -}; - struct fuzzy_session { struct rspamd_worker *worker; struct rspamd_fuzzy_cmd *cmd; gint fd; guint64 time; - gboolean legacy; rspamd_inet_addr_t *addr; struct rspamd_fuzzy_storage_ctx *ctx; }; @@ -114,28 +105,9 @@ rspamd_fuzzy_write_reply (struct fuzzy_session *session, struct rspamd_fuzzy_reply *rep) { gint r; - gchar buf[64]; - - if (session->legacy) { - if (rep->prob > 0.5) { - if (session->cmd->cmd == FUZZY_CHECK) { - r = rspamd_snprintf (buf, sizeof (buf), "OK %d %d" CRLF, - rep->value, rep->flag); - } - else { - r = rspamd_snprintf (buf, sizeof (buf), "OK" CRLF); - } - } - else { - r = rspamd_snprintf (buf, sizeof (buf), "ERR" CRLF); - } - r = rspamd_inet_address_sendto (session->fd, buf, r, 0, session->addr); - } - else { - r = rspamd_inet_address_sendto (session->fd, rep, sizeof (*rep), 0, - session->addr); - } + r = rspamd_inet_address_sendto (session->fd, rep, sizeof (*rep), 0, + session->addr); if (r == -1) { if (errno == EINTR) { @@ -240,8 +212,7 @@ accept_fuzzy_socket (gint fd, short what, void *arg) struct fuzzy_session session; gint r; guint8 buf[2048]; - struct rspamd_fuzzy_cmd *cmd = NULL, lcmd; - struct legacy_fuzzy_cmd *l; + struct rspamd_fuzzy_cmd *cmd = NULL; enum rspamd_fuzzy_epoch epoch = RSPAMD_FUZZY_EPOCH_MAX; session.worker = worker; @@ -262,22 +233,8 @@ accept_fuzzy_socket (gint fd, short what, void *arg) return; } - if ((guint)r == sizeof (struct legacy_fuzzy_cmd)) { - session.legacy = TRUE; - l = (struct legacy_fuzzy_cmd *)buf; - lcmd.version = 2; - memcpy (lcmd.digest, l->hash, sizeof (lcmd.digest)); - lcmd.cmd = l->cmd; - lcmd.flag = l->flag; - lcmd.shingles_count = 0; - lcmd.value = l->value; - lcmd.tag = 0; - cmd = &lcmd; - epoch = RSPAMD_FUZZY_EPOCH6; - } - else if ((guint)r >= sizeof (struct rspamd_fuzzy_cmd)) { + if ((guint)r >= sizeof (struct rspamd_fuzzy_cmd)) { /* Check shingles count sanity */ - session.legacy = FALSE; cmd = (struct rspamd_fuzzy_cmd *)buf; epoch = rspamd_fuzzy_command_valid (cmd, r); if (epoch == RSPAMD_FUZZY_EPOCH_MAX) { diff --git a/src/fuzzy_storage.h b/src/fuzzy_storage.h index e2803c52e..b9997da8b 100644 --- a/src/fuzzy_storage.h +++ b/src/fuzzy_storage.h @@ -3,7 +3,6 @@ #include "config.h" #include "main.h" -#include "fuzzy.h" #include "shingles.h" #define RSPAMD_FUZZY_VERSION 3 @@ -13,14 +12,6 @@ #define FUZZY_WRITE 1 #define FUZZY_DEL 2 -struct legacy_fuzzy_cmd { - u_char cmd; - guint32 blocksize; - gint32 value; - gint32 flag; - u_char hash[FUZZY_HASHLEN]; -}; - RSPAMD_PACKED(rspamd_fuzzy_cmd) { guint8 version; guint8 cmd; diff --git a/src/libmime/message.h b/src/libmime/message.h index 04e7cd5f3..b509b23cd 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -7,7 +7,6 @@ #define RSPAMD_MESSAGE_H #include "config.h" -#include "fuzzy.h" struct rspamd_task; struct controller_session; @@ -43,11 +42,8 @@ struct mime_text_part { GByteArray *content; struct html_content *html; GList *urls_offset; /**< list of offsets of urls */ - rspamd_fuzzy_t *fuzzy; - rspamd_fuzzy_t *double_fuzzy; GMimeObject *parent; struct mime_part *mime_part; - rspamd_fstring_t *diff_str; GArray *words; GArray *normalized_words; guint nlines; diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c index bff70c1b7..a4c02989e 100644 --- a/src/libmime/mime_expressions.c +++ b/src/libmime/mime_expressions.c @@ -27,7 +27,6 @@ #include "cfg_file.h" #include "main.h" #include "message.h" -#include "fuzzy.h" #include "mime_expressions.h" #include "html.h" #include "lua/lua_common.h" diff --git a/src/libserver/fuzzy_backend.c b/src/libserver/fuzzy_backend.c index 7cd4faa91..a5bf28c7c 100644 --- a/src/libserver/fuzzy_backend.c +++ b/src/libserver/fuzzy_backend.c @@ -24,20 +24,9 @@ #include "config.h" #include "main.h" #include "fuzzy_backend.h" -#include "fuzzy_storage.h" #include <sqlite3.h> -/* Magic sequence for hashes file */ -#define FUZZY_FILE_MAGIC "rsh" - -struct rspamd_legacy_fuzzy_node { - gint32 value; - gint32 flag; - guint64 time; - rspamd_fuzzy_t h; -}; - struct rspamd_fuzzy_backend { sqlite3 *db; char *path; @@ -45,7 +34,6 @@ struct rspamd_fuzzy_backend { gsize expired; }; - static const char *create_tables_sql = "BEGIN;" "CREATE TABLE digests(" @@ -393,80 +381,11 @@ rspamd_fuzzy_backend_open_db (const gchar *path, GError **err) return bk; } -/* - * Convert old database to the new format - */ -static gboolean -rspamd_fuzzy_backend_convert (const gchar *path, int fd, GError **err) -{ - gchar tmpdb[PATH_MAX]; - struct rspamd_fuzzy_backend *nbackend; - struct stat st; - gint off; - guint8 *map, *p, *end; - struct rspamd_legacy_fuzzy_node *n; - - rspamd_snprintf (tmpdb, sizeof (tmpdb), "%s.converted", path); - (void)unlink (tmpdb); - nbackend = rspamd_fuzzy_backend_create_db (tmpdb, FALSE, err); - - if (nbackend == NULL) { - return FALSE; - } - - (void)fstat (fd, &st); - (void)lseek (fd, 0, SEEK_SET); - - off = sizeof (FUZZY_FILE_MAGIC); - if (off >= st.st_size) { - msg_warn ("old fuzzy storage is empty or corrupted, remove it"); - } - else { - if ((map = mmap (NULL, st.st_size - off, PROT_READ, MAP_SHARED, fd, - 0)) == MAP_FAILED) { - g_set_error (err, rspamd_fuzzy_backend_quark (), - errno, "Cannot mmap file %s: %s", - path, strerror (errno)); - rspamd_fuzzy_backend_close (nbackend); - - return FALSE; - } - - end = map + st.st_size; - p = map + off; - - rspamd_fuzzy_backend_run_simple (RSPAMD_FUZZY_BACKEND_TRANSACTION_START, - nbackend, NULL); - while (p < end) { - n = (struct rspamd_legacy_fuzzy_node *)p; - /* Convert node flag, digest, value, time */ - if (rspamd_fuzzy_backend_run_stmt (nbackend, RSPAMD_FUZZY_BACKEND_INSERT, - (gint)n->flag, n->h.hash_pipe, - (gint64)n->value, n->time) != SQLITE_OK) { - msg_warn ("Cannot execute init sql %s: %s", - prepared_stmts[RSPAMD_FUZZY_BACKEND_INSERT].sql, - sqlite3_errmsg (nbackend->db)); - } - p += sizeof (struct rspamd_legacy_fuzzy_node); - } - - munmap (map, st.st_size); - rspamd_fuzzy_backend_run_simple (RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT, - nbackend, NULL); - } - - rspamd_fuzzy_backend_run_sql (create_index_sql, nbackend, NULL); - rspamd_fuzzy_backend_close (nbackend); - rename (tmpdb, path); - - return TRUE; -} - struct rspamd_fuzzy_backend* rspamd_fuzzy_backend_open (const gchar *path, GError **err) { - gchar *dir, header[4]; - gint fd, r; + gchar *dir; + gint fd; struct rspamd_fuzzy_backend *res; static const char sqlite_wal[] = "PRAGMA journal_mode=\"wal\";", fallback_journal[] = "PRAGMA journal_mode=\"off\";"; @@ -501,21 +420,6 @@ rspamd_fuzzy_backend_open (const gchar *path, GError **err) return NULL; } } - else { - - /* Check for legacy format */ - if ((r = read (fd, header, sizeof (header))) == sizeof (header)) { - if (memcmp (header, FUZZY_FILE_MAGIC, sizeof (header) - 1) == 0) { - msg_info ("Trying to convert old fuzzy database"); - if (!rspamd_fuzzy_backend_convert (path, fd, err)) { - close (fd); - return NULL; - } - msg_info ("Old database converted"); - } - close (fd); - } - } close (fd); diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c index a4d78427f..1fedbbb46 100644 --- a/src/libserver/protocol.c +++ b/src/libserver/protocol.c @@ -645,20 +645,9 @@ write_hashes_to_log (struct rspamd_task *task, GString *logbuf) struct mime_text_part *text_part; guint i; + /* TODO: rework parts hashes */ for (i = 0; i < task->text_parts->len; i ++) { text_part = g_ptr_array_index (task->text_parts, i); - - if (text_part->fuzzy) { - if (i != task->text_parts->len - 1) { - rspamd_printf_gstring (logbuf, - " part: %Xd,", - text_part->fuzzy->h); - } - else { - rspamd_printf_gstring (logbuf, " part: %Xd", - text_part->fuzzy->h); - } - } } } diff --git a/src/libutil/CMakeLists.txt b/src/libutil/CMakeLists.txt index 61e5d6d15..338a74027 100644 --- a/src/libutil/CMakeLists.txt +++ b/src/libutil/CMakeLists.txt @@ -6,7 +6,6 @@ SET(LIBRSPAMDUTILSRC ${CMAKE_CURRENT_SOURCE_DIR}/diff.c ${CMAKE_CURRENT_SOURCE_DIR}/expression.c ${CMAKE_CURRENT_SOURCE_DIR}/fstring.c - ${CMAKE_CURRENT_SOURCE_DIR}/fuzzy.c ${CMAKE_CURRENT_SOURCE_DIR}/hash.c ${CMAKE_CURRENT_SOURCE_DIR}/http.c ${CMAKE_CURRENT_SOURCE_DIR}/keypairs_cache.c diff --git a/src/libutil/fuzzy.c b/src/libutil/fuzzy.c deleted file mode 100644 index 218065b77..000000000 --- a/src/libutil/fuzzy.c +++ /dev/null @@ -1,557 +0,0 @@ -/* - * Copyright (c) 2009-2012, Vsevolod Stakhov - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - - -#include "config.h" -#include "mem_pool.h" -#include "fstring.h" -#include "fuzzy.h" -#include "message.h" -#include "url.h" -#include "main.h" -#include "xxhash.h" - -#define ROLL_WINDOW_SIZE 9 -#define MIN_FUZZY_BLOCK_SIZE 3 -#define HASH_INIT 0x28021967 - -static const char *b64 = - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - -struct roll_state { - guint32 h[3]; - gchar window[ROLL_WINDOW_SIZE]; - gint n; -}; - -static struct roll_state rs; - - -/* Rolling hash function based on Adler-32 checksum */ -static guint32 -fuzzy_roll_hash (guint c) -{ - /* Check window position */ - if (rs.n == ROLL_WINDOW_SIZE) { - rs.n = 0; - } - - rs.h[1] -= rs.h[0]; - rs.h[1] += ROLL_WINDOW_SIZE * c; - - rs.h[0] += c; - rs.h[0] -= rs.window[rs.n]; - - /* Save current symbol */ - rs.window[rs.n] = c; - rs.n++; - - rs.h[2] <<= 5; - rs.h[2] ^= c; - - return rs.h[0] + rs.h[1] + rs.h[2]; -} - -/* A simple non-rolling hash, based on the FNV hash */ -static guint32 -fuzzy_fnv_hash (guint c, guint32 hval) -{ - hval ^= c; - hval += - (hval << 1) + (hval << 4) + (hval << 7) + (hval << 8) + (hval << 24); - return hval; -} - -/* Calculate blocksize depending on length of input */ -static guint32 -fuzzy_blocksize (guint32 len) -{ - guint32 nlen = MIN_FUZZY_BLOCK_SIZE; - - while (nlen * (FUZZY_HASHLEN - 1) < len) { - nlen *= 2; - } - return nlen; -} - - -/* Update hash with new symbol */ -static void -fuzzy_update (rspamd_fuzzy_t * h, guint c) -{ - h->rh = fuzzy_roll_hash (c); - h->h = fuzzy_fnv_hash (c, h->h); - - if (h->rh % h->block_size == (h->block_size - 1)) { - h->hash_pipe[h->hi] = b64[h->h % 64]; - if (h->hi < FUZZY_HASHLEN - 2) { - h->h = HASH_INIT; - h->hi++; - } - } -} - -static void -fuzzy_update2 (rspamd_fuzzy_t * h1, rspamd_fuzzy_t *h2, guint c) -{ - h1->rh = fuzzy_roll_hash (c); - h1->h = fuzzy_fnv_hash (c, h1->h); - h2->rh = h1->rh; - h2->h = fuzzy_fnv_hash (c, h2->h); - - if (h1->rh % h1->block_size == (h1->block_size - 1)) { - h1->hash_pipe[h1->hi] = b64[h1->h % 64]; - if (h1->hi < FUZZY_HASHLEN - 2) { - h1->h = HASH_INIT; - h1->hi++; - } - } - if (h2->rh % h2->block_size == (h2->block_size - 1)) { - h2->hash_pipe[h2->hi] = b64[h2->h % 64]; - if (h2->hi < FUZZY_HASHLEN - 2) { - h2->h = HASH_INIT; - h2->hi++; - } - } -} - -/* - * Levenshtein distance between string1 and string2. - * - * Replace cost is normally 1, and 2 with nonzero xcost. - */ -guint32 -rspamd_levinstein_distance (gchar *s1, gint len1, gchar *s2, gint len2) -{ - gint i; - gint *row; /* we only need to keep one row of costs */ - gint *end; - gint half, nx; - gchar *sx, *char2p, char1; - gint *p, D, x, offset, c3; - - /* strip common prefix */ - while (len1 > 0 && len2 > 0 && *s1 == *s2) { - len1--; - len2--; - s1++; - s2++; - } - - /* strip common suffix */ - while (len1 > 0 && len2 > 0 && s1[len1 - 1] == s2[len2 - 1]) { - len1--; - len2--; - } - - /* catch trivial cases */ - if (len1 == 0) { - return len2; - } - - if (len2 == 0) { - return len1; - } - - /* make the inner cycle (i.e. string2) the longer one */ - if (len1 > len2) { - nx = len1; - sx = s1; - len1 = len2; - len2 = nx; - s1 = s2; - s2 = sx; - } - /* check len1 == 1 separately */ - if (len1 == 1) { - return len2 - (memchr (s2, *s1, len2) != NULL); - } - - len1++; - len2++; - half = len1 >> 1; - - /* initalize first row */ - row = g_malloc (len2 * sizeof (gint)); - end = row + len2 - 1; - for (i = 0; i < len2; i++) { - row[i] = i; - } - - /* in this case we don't have to scan two corner triangles (of size len1/2) - * in the matrix because no best path can go throught them. note this - * breaks when len1 == len2 == 2 so the memchr() special case above is - * necessary */ - row[0] = len1 - half - 1; - for (i = 1; i < len1; i++) { - char1 = s1[i - 1]; - /* skip the upper triangle */ - if (i >= len1 - half) { - offset = i - (len1 - half); - char2p = s2 + offset; - p = row + offset; - c3 = *(p++) + (char1 != *(char2p++)); - x = *p; - x++; - D = x; - if (x > c3) - x = c3; - *(p++) = x; - } - else { - p = row + 1; - char2p = s2; - D = x = i; - } - /* skip the lower triangle */ - if (i <= half + 1) - end = row + len2 + i - half - 2; - /* main */ - while (p <= end) { - c3 = --D + (char1 != *(char2p++)); - x++; - if (x > c3) - x = c3; - D = *p; - D++; - if (x > D) - x = D; - *(p++) = x; - } - /* lower triangle sentinel */ - if (i <= half) { - c3 = --D + (char1 != *char2p); - x++; - if (x > c3) - x = c3; - *p = x; - } - } - - i = *end; - g_free (row); - return i; -} - -/* Calculate fuzzy hash for specified string */ -rspamd_fuzzy_t * -rspamd_fuzzy_init (rspamd_fstring_t * in, rspamd_mempool_t * pool) -{ - rspamd_fuzzy_t *new; - guint i, repeats = 0; - gchar *c = in->begin, last = '\0'; - gsize real_len = 0; - - new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_fuzzy_t)); - bzero (&rs, sizeof (rs)); - for (i = 0; i < in->len; i++) { - if (*c == last) { - repeats++; - } - else { - repeats = 0; - } - if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) { - real_len++; - } - last = *c; - c++; - } - - new->block_size = fuzzy_blocksize (real_len); - c = in->begin; - - for (i = 0; i < in->len; i++) { - if (*c == last) { - repeats++; - } - else { - repeats = 0; - } - if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) { - fuzzy_update (new, *c); - } - last = *c; - c++; - } - - /* Check whether we have more bytes in a rolling window */ - if (new->rh != 0) { - new->hash_pipe[new->hi] = b64[new->h % 64]; - } - - return new; -} - -rspamd_fuzzy_t * -rspamd_fuzzy_from_byte_array (GByteArray * in, rspamd_mempool_t * pool) -{ - rspamd_fstring_t f; - - f.begin = (gchar *)in->data; - f.len = in->len; - - return rspamd_fuzzy_init (&f, pool); -} - -void -rspamd_fuzzy_from_text_part (struct mime_text_part *part, - rspamd_mempool_t *pool, - gsize max_diff) -{ - rspamd_fuzzy_t *new, *new2; - gchar *c, *end, *begin, *p; - gsize real_len = 0, len = part->content->len; - GList *cur_offset; - struct process_exception *cur_ex = NULL; - gunichar uc; - gboolean write_diff = FALSE; - - cur_offset = part->urls_offset; - if (cur_offset != NULL) { - cur_ex = cur_offset->data; - } - - begin = (gchar *)part->content->data; - c = begin; - new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_fuzzy_t)); - new2 = rspamd_mempool_alloc0 (pool, sizeof (rspamd_fuzzy_t)); - bzero (&rs, sizeof (rs)); - end = c + len; - - if (IS_PART_UTF (part)) { - while (c < end) { - if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) { - c += cur_ex->len + 1; - cur_offset = g_list_next (cur_offset); - if (cur_offset != NULL) { - cur_ex = cur_offset->data; - } - } - else { - uc = g_utf8_get_char (c); - if (g_unichar_isalnum (uc)) { - p = g_utf8_next_char (c); - real_len += p - c; - } - else { - p = g_utf8_next_char (c); - } - c = p; - } - } - } - else { - while (c < end) { - if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) { - c += cur_ex->len + 1; - cur_offset = g_list_next (cur_offset); - if (cur_offset != NULL) { - cur_ex = cur_offset->data; - } - } - else { - if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) { - real_len++; - } - c++; - } - } - } - - write_diff = real_len > 0 && real_len < max_diff; - - if (write_diff) { - part->diff_str = rspamd_fstralloc (pool, real_len + 1); - } - else { - part->diff_str = NULL; - } - - new->block_size = fuzzy_blocksize (real_len); - new2->block_size = new->block_size * 2; - - cur_offset = part->urls_offset; - if (cur_offset != NULL) { - cur_ex = cur_offset->data; - } - - begin = (gchar *)part->content->data; - c = begin; - end = c + len; - if (IS_PART_UTF (part)) { - - while (c < end) { - if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) { - c += cur_ex->len + 1; - cur_offset = g_list_next (cur_offset); - if (cur_offset != NULL) { - cur_ex = cur_offset->data; - } - } - else { - uc = g_utf8_get_char (c); - if (g_unichar_isalnum (uc)) { - fuzzy_update2 (new, new2, uc); - if (write_diff) { - rspamd_fstrappend_u (part->diff_str, uc); - } - } - c = g_utf8_next_char (c); - } - } - } - else { - while (c < end) { - if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) { - c += cur_ex->len + 1; - cur_offset = g_list_next (cur_offset); - if (cur_offset != NULL) { - cur_ex = cur_offset->data; - } - } - else { - if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) { - fuzzy_update2 (new, new2, *c); - if (write_diff) { - rspamd_fstrappend_c (part->diff_str, *c); - } - } - c++; - } - } - } - - /* Check whether we have more bytes in a rolling window */ - if (new->rh != 0) { - new->hash_pipe[new->hi] = b64[new->h % 64]; - } - if (new2->rh != 0) { - new2->hash_pipe[new2->hi] = b64[new2->h % 64]; - } - - part->fuzzy = new; - part->double_fuzzy = new2; -} - -/* Compare score of difference between two hashes 0 - different hashes, 100 - identical hashes */ -gint -rspamd_fuzzy_compare (rspamd_fuzzy_t * h1, rspamd_fuzzy_t * h2) -{ - gint res, l1, l2; - - /* If we have hashes of different size, input strings are too different */ - if (h1->block_size != h2->block_size) { - return 0; - } - - l1 = strlen (h1->hash_pipe); - l2 = strlen (h2->hash_pipe); - - if (l1 == 0 || l2 == 0) { - if (l1 == 0 && l2 == 0) { - return 100; - } - else { - return 0; - } - } - - res = rspamd_levinstein_distance (h1->hash_pipe, l1, h2->hash_pipe, l2); - res = 100 - (2 * res * 100) / (l1 + l2); - - return res; -} - -gint -rspamd_fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2) -{ - if (p1->fuzzy != NULL && p2->fuzzy != NULL) { - if (p1->fuzzy->block_size == p2->fuzzy->block_size) { - return rspamd_fuzzy_compare (p1->fuzzy, p2->fuzzy); - } - else if (p1->double_fuzzy->block_size == p2->fuzzy->block_size) { - return rspamd_fuzzy_compare (p1->double_fuzzy, p2->fuzzy); - } - else if (p2->double_fuzzy->block_size == p1->fuzzy->block_size) { - return rspamd_fuzzy_compare (p2->double_fuzzy, p1->fuzzy); - } - } - - return 0; -} - -gint -rspamd_fuzzy_len (rspamd_fuzzy_t *h) -{ - gint len; - void *nullpos; - - nullpos = memchr (h->hash_pipe, '\0', sizeof (h->hash_pipe)); - - if (nullpos == NULL) { - len = sizeof (h->hash_pipe); - } - else { - len = (char *)nullpos - h->hash_pipe; - } - - return len; -} - -guint -rspamd_fuzzy_hash (gconstpointer key) -{ - rspamd_fuzzy_t *fh = (rspamd_fuzzy_t *)key; - XXH64_state_t xxh; - - XXH64_reset (&xxh, rspamd_hash_seed ()); - - XXH64_update (&xxh, &fh->block_size, sizeof (fh->block_size)); - XXH64_update (&xxh, fh->hash_pipe, rspamd_fuzzy_len (fh)); - - return XXH64_digest (&xxh); -} - -gboolean -rspamd_fuzzy_equal (gconstpointer v1, gconstpointer v2) -{ - rspamd_fuzzy_t *fh1= (rspamd_fuzzy_t *)v1, - *fh2 = (rspamd_fuzzy_t *)v2; - - if (fh1->block_size == fh2->block_size) { - gint l1 = rspamd_fuzzy_len (fh1), - l2 = rspamd_fuzzy_len (fh2); - - if (l1 == l2) { - return (memcmp (fh1->hash_pipe, fh2->hash_pipe, l1) == 0); - } - } - - return FALSE; -} - -/* - * vi:ts=4 - */ diff --git a/src/libutil/fuzzy.h b/src/libutil/fuzzy.h deleted file mode 100644 index 813599c6b..000000000 --- a/src/libutil/fuzzy.h +++ /dev/null @@ -1,77 +0,0 @@ -/** - * @file fuzzy.h - * Fuzzy hashes API - */ - -#ifndef RSPAMD_FUZZY_H -#define RSPAMD_FUZZY_H - -#include "config.h" -#include "mem_pool.h" -#include "fstring.h" - -#define FUZZY_HASHLEN 64 - -typedef struct fuzzy_hash_s { - gchar hash_pipe[FUZZY_HASHLEN]; /**< result hash */ - guint32 block_size; /**< current blocksize */ - guint32 rh; /**< roll hash value */ - guint32 h; /**< hash of block */ - guint32 hi; /**< current index in hash pipe */ -} rspamd_fuzzy_t; - -struct mime_text_part; - -/** - * Calculate fuzzy hash for specified string - * @param in input string - * @param pool pool object - * @return fuzzy_hash object allocated in pool - */ -rspamd_fuzzy_t * rspamd_fuzzy_init (rspamd_fstring_t *in, rspamd_mempool_t *pool); -/** - * Calculate fuzzy hash for specified byte array - * @param in input string - * @param pool pool object - * @return fuzzy_hash object allocated in pool - */ -rspamd_fuzzy_t * rspamd_fuzzy_from_byte_array (GByteArray *in, rspamd_mempool_t *pool); - -/** - * Calculate fuzzy hash for specified text part - * @param part text part object - * @param pool pool object - * @param max_diff maximum text length to use diff algorithm in comparasions - * @return fuzzy_hash object allocated in pool - */ -void rspamd_fuzzy_from_text_part (struct mime_text_part *part, - rspamd_mempool_t *pool, - gsize max_diff); - -/** - * Compare score of difference between two hashes - * @param h1 first hash - * @param h2 second hash - * @return result in percents 0 - different hashes, 100 - identical hashes - */ -gint rspamd_fuzzy_compare (rspamd_fuzzy_t *h1, rspamd_fuzzy_t *h2); - -/* - * Compare two text parts and return percents of difference - */ -gint rspamd_fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2); - -/* - * Calculate levenstein distance between two strings. Note: this algorithm should be used - * only for short texts - it runs too slow on long ones. - */ -guint32 rspamd_levinstein_distance (gchar *s1, gint len1, gchar *s2, gint len2); - -/* - * Hash table utilities - */ -gint rspamd_fuzzy_len (rspamd_fuzzy_t *h); -guint rspamd_fuzzy_hash (gconstpointer key); -gboolean rspamd_fuzzy_equal (gconstpointer v1, gconstpointer v2); - -#endif diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c index 085295b18..323ebfd32 100644 --- a/src/lua/lua_mimepart.c +++ b/src/lua/lua_mimepart.c @@ -86,12 +86,6 @@ LUA_FUNCTION_DEF (textpart, is_empty); */ LUA_FUNCTION_DEF (textpart, is_html); /*** - * @method text_part:get_fuzzy() - * Returns base32 encoded value of fuzzy hash of the specified part - * @return {string} fuzzy hash value - */ -LUA_FUNCTION_DEF (textpart, get_fuzzy); -/*** * @method text_part:get_language() * Returns the code of the most used unicode script in the text part. Does not work with raw parts * @return {string} short abbreviation (such as `ru`) for the script's language @@ -103,16 +97,6 @@ LUA_FUNCTION_DEF (textpart, get_language); * @return {mimepart} mimepart object */ LUA_FUNCTION_DEF (textpart, get_mimepart); -/*** - * @method text_part:compare_distance(other) - * Calculates the difference to another text part. This function is intended to work with - * the parts of `multipart/alternative` container only. If the two parts are not the parts of the - * same `multipart/alternative` container, then they are considered as unrelated and - * `-1` is returned. - * @param {text_part} other text part to compare - * @return {integer} commodity percentage (e.g. the same strings give `100`, different give `0` and unrelated give `-1`) - */ -LUA_FUNCTION_DEF (textpart, compare_distance); static const struct luaL_reg textpartlib_m[] = { LUA_INTERFACE_DEF (textpart, is_utf), @@ -121,10 +105,8 @@ static const struct luaL_reg textpartlib_m[] = { LUA_INTERFACE_DEF (textpart, get_lines_count), LUA_INTERFACE_DEF (textpart, is_empty), LUA_INTERFACE_DEF (textpart, is_html), - LUA_INTERFACE_DEF (textpart, get_fuzzy), LUA_INTERFACE_DEF (textpart, get_language), LUA_INTERFACE_DEF (textpart, get_mimepart), - LUA_INTERFACE_DEF (textpart, compare_distance), {"__tostring", rspamd_lua_class_tostring}, {NULL, NULL} }; @@ -353,24 +335,6 @@ lua_textpart_is_html (lua_State * L) return 1; } -static gint -lua_textpart_get_fuzzy (lua_State * L) -{ - struct mime_text_part *part = lua_check_textpart (L); - gchar *out; - - if (part == NULL || IS_PART_EMPTY (part)) { - lua_pushnil (L); - return 1; - } - - out = rspamd_encode_base32 (part->fuzzy->hash_pipe, - strlen (part->fuzzy->hash_pipe)); - lua_pushstring (L, out); - g_free (out); - - return 1; -} static gint lua_textpart_get_language (lua_State * L) @@ -408,60 +372,6 @@ lua_textpart_get_mimepart (lua_State * L) return 1; } -static gint -lua_textpart_compare_distance (lua_State * L) -{ - struct mime_text_part *part = lua_check_textpart (L), *other; - void *ud = luaL_checkudata (L, 2, "rspamd{textpart}"); - gint diff = -1; - GMimeObject *parent; - const GMimeContentType *ct; - - luaL_argcheck (L, ud != NULL, 2, "'textpart' expected"); - other = ud ? *((struct mime_text_part **)ud) : NULL; - - if (other != NULL && part->parent && part->parent == other->parent) { - parent = part->parent; - ct = g_mime_object_get_content_type (parent); -#ifndef GMIME24 - if (ct == NULL || - !g_mime_content_type_is_type (ct, "multipart", "alternative")) { -#else - if (ct == NULL || - !g_mime_content_type_is_type ((GMimeContentType *)ct, "multipart", - "alternative")) { -#endif - diff = -1; - - } - else { - if (!IS_PART_EMPTY (part) && !IS_PART_EMPTY (other)) { - if (part->diff_str != NULL && other->diff_str != NULL) { - diff = rspamd_diff_distance (part->diff_str, - other->diff_str); - } - else { - diff = rspamd_fuzzy_compare_parts (part, other); - } - } - else if ((IS_PART_EMPTY (part) && - !IS_PART_EMPTY (other)) || (!IS_PART_EMPTY (part) && - IS_PART_EMPTY (other))) { - /* Empty and non empty parts are different */ - diff = 0; - } - } - } - else { - diff = -1; - } - - - lua_pushinteger (L, diff); - - return 1; -} - /* Mimepart implementation */ static gint diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c index ec849da54..857033ec0 100644 --- a/src/plugins/fuzzy_check.c +++ b/src/plugins/fuzzy_check.c @@ -563,46 +563,34 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule, rspamd_fstring_t *word; GArray *words; - if (legacy || part->words == NULL || part->words->len == 0) { - cmd = rspamd_mempool_alloc0 (pool, sizeof (*cmd)); + shcmd = rspamd_mempool_alloc0 (pool, sizeof (*shcmd)); - cmd->shingles_count = 0; - rspamd_strlcpy (cmd->digest, part->fuzzy->hash_pipe, sizeof (cmd->digest)); + /* + * Generate hash from all words in the part + */ + g_assert (blake2b_init_key (&st, BLAKE2B_OUTBYTES, rule->hash_key->str, + rule->hash_key->len) != -1); + words = fuzzy_preprocess_words (part, pool); - if (size != NULL) { - *size = sizeof (struct rspamd_fuzzy_cmd); - } + for (i = 0; i < words->len; i ++) { + word = &g_array_index (words, rspamd_fstring_t, i); + blake2b_update (&st, word->begin, word->len); } - else { - shcmd = rspamd_mempool_alloc0 (pool, sizeof (*shcmd)); - - /* - * Generate hash from all words in the part - */ - g_assert (blake2b_init_key (&st, BLAKE2B_OUTBYTES, rule->hash_key->str, - rule->hash_key->len) != -1); - words = fuzzy_preprocess_words (part, pool); + blake2b_final (&st, shcmd->basic.digest, sizeof (shcmd->basic.digest)); - for (i = 0; i < words->len; i ++) { - word = &g_array_index (words, rspamd_fstring_t, i); - blake2b_update (&st, word->begin, word->len); - } - blake2b_final (&st, shcmd->basic.digest, sizeof (shcmd->basic.digest)); - - msg_debug ("loading shingles with key %*xs", 16, rule->shingles_key->str); - sh = rspamd_shingles_generate (words, - rule->shingles_key->str, pool, - rspamd_shingles_default_filter, NULL); - if (sh != NULL) { - memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl)); - shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE; - } + msg_debug ("loading shingles with key %*xs", 16, rule->shingles_key->str); + sh = rspamd_shingles_generate (words, + rule->shingles_key->str, pool, + rspamd_shingles_default_filter, NULL); + if (sh != NULL) { + memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl)); + shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE; + } - cmd = (struct rspamd_fuzzy_cmd *)shcmd; + cmd = (struct rspamd_fuzzy_cmd *)shcmd; - if (size != NULL) { - *size = sizeof (struct rspamd_fuzzy_shingle_cmd); - } + if (size != NULL) { + *size = sizeof (struct rspamd_fuzzy_shingle_cmd); } cmd->tag = ottery_rand_uint32 (); @@ -959,7 +947,6 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule, struct mime_part *mime_part; struct rspamd_image *image; struct rspamd_fuzzy_cmd *cmd; - gsize hashlen; guint i; GPtrArray *res; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 7c3312634..584bbbd4d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,6 +1,5 @@ SET(TESTSRC rspamd_mem_pool_test.c rspamd_statfile_test.c - rspamd_fuzzy_test.c rspamd_url_test.c rspamd_dns_test.c rspamd_async_test.c diff --git a/test/rspamd_fuzzy_test.c b/test/rspamd_fuzzy_test.c deleted file mode 100644 index b1f1f5dcd..000000000 --- a/test/rspamd_fuzzy_test.c +++ /dev/null @@ -1,76 +0,0 @@ -#include "config.h" -#include "main.h" -#include "fuzzy.h" -#include "tests.h" - -static char *s1 = "This is sample test text.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n"; -static char *s2 = "This is sample test text.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n" - "abcdefghijklmnopzrstuvwx.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n"; -static char *s3 = ""; -static char *s4 = "abcdefghijklmn\r\n"; -static char *s5 = "This is sample test text.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n" - "abcdefghijklmnopzrstuvwx.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n" - "abcdefghijklmnopqrstuvwx.\r\n"; - -void -rspamd_fuzzy_test_func () -{ - rspamd_mempool_t *pool; - rspamd_fuzzy_t *h1, *h2, *h3, *h4, *h5; - rspamd_fstring_t f1, f2, f3, f4, f5; - int diff2; - - pool = rspamd_mempool_new (1024); - f1.begin = s1; - f1.len = strlen (s1); - f2.begin = s2; - f2.len = strlen (s2); - f3.begin = s3; - f3.len = strlen (s3); - f4.begin = s4; - f4.len = strlen (s4); - f5.begin = s5; - f5.len = strlen (s5); - - h1 = rspamd_fuzzy_init (&f1, pool); - h2 = rspamd_fuzzy_init (&f2, pool); - h3 = rspamd_fuzzy_init (&f3, pool); - h4 = rspamd_fuzzy_init (&f4, pool); - h5 = rspamd_fuzzy_init (&f5, pool); - - diff2 = rspamd_fuzzy_compare (h2, h5); - msg_debug ("rspamd_fuzzy_test_func: s1, s2 difference between strings is %d", rspamd_fuzzy_compare (h1, h2)); - msg_debug ("rspamd_fuzzy_test_func: s1, s3 difference between strings is %d", rspamd_fuzzy_compare (h1, h3)); - msg_debug ("rspamd_fuzzy_test_func: s3, s4 difference between strings is %d", rspamd_fuzzy_compare (h3, h4)); - msg_debug ("rspamd_fuzzy_test_func: s2, s4 difference between strings is %d", rspamd_fuzzy_compare (h2, h4)); - msg_debug ("rspamd_fuzzy_test_func: s2, s5 difference between strings is %d", diff2); - - /* Identical strings */ - if (diff2 != 100) { - msg_err ("hash difference is %d", diff2); - g_assert (diff2 == 100); - } - - rspamd_mempool_delete (pool); -} diff --git a/test/rspamd_test_suite.c b/test/rspamd_test_suite.c index 5dc854560..c1a2e27f5 100644 --- a/test/rspamd_test_suite.c +++ b/test/rspamd_test_suite.c @@ -45,7 +45,6 @@ main (int argc, char **argv) g_log_set_default_handler (rspamd_glib_log_function, rspamd_main->logger); g_test_add_func ("/rspamd/mem_pool", rspamd_mem_pool_test_func); - g_test_add_func ("/rspamd/fuzzy", rspamd_fuzzy_test_func); g_test_add_func ("/rspamd/url", rspamd_url_test_func); g_test_add_func ("/rspamd/statfile", rspamd_statfile_test_func); g_test_add_func ("/rspamd/radix", rspamd_radix_test_func); diff --git a/test/tests.h b/test/tests.h index f0dab9c02..a2ba05b84 100644 --- a/test/tests.h +++ b/test/tests.h @@ -11,9 +11,6 @@ void rspamd_url_test_func (void); /* Memory pools */ void rspamd_mem_pool_test_func (void); -/* Fuzzy hashes */ -void rspamd_fuzzy_test_func (void); - /* Stat file */ void rspamd_statfile_test_func (void); |