summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-18 00:10:56 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-18 00:10:56 +0100
commit ab15b9a3c95d6c0d37330c96d8827ac59b2fee78 (patch)
tree 70a9faf4dc79456cfa4aa804a5ce3dbe5c695024
parent af127078a26a41e6254d97f760c2afcfea2110ef (diff)
download rspamd-ab15b9a3c95d6c0d37330c96d8827ac59b2fee78.tar.gz
rspamd-ab15b9a3c95d6c0d37330c96d8827ac59b2fee78.zip
Remove legacy fuzzy code completely.
-rw-r--r-- src/fuzzy_storage.c 51
-rw-r--r-- src/fuzzy_storage.h 9
-rw-r--r-- src/libmime/message.h 4
-rw-r--r-- src/libmime/mime_expressions.c 1
-rw-r--r-- src/libserver/fuzzy_backend.c 100
-rw-r--r-- src/libserver/protocol.c 13
-rw-r--r-- src/libutil/CMakeLists.txt 1
-rw-r--r-- src/libutil/fuzzy.c 557
-rw-r--r-- src/libutil/fuzzy.h 77
-rw-r--r-- src/lua/lua_mimepart.c 90
-rw-r--r-- src/plugins/fuzzy_check.c 57
-rw-r--r-- test/CMakeLists.txt 1
-rw-r--r-- test/rspamd_fuzzy_test.c 76
-rw-r--r-- test/rspamd_test_suite.c 1
-rw-r--r-- test/tests.h 3
15 files changed, 29 insertions, 1012 deletions
diff --git a/src/fuzzy_storage.c b/src/fuzzy_storage.c
index f544c4090..507c99349 100644
--- a/src/fuzzy_storage.c
+++ b/src/fuzzy_storage.c
@@ -34,7 +34,6 @@
#include "cfg_file.h"
#include "url.h"
#include "message.h"
-#include "fuzzy.h"
#include "bloom.h"
#include "map.h"
#include "fuzzy_storage.h"
@@ -80,19 +79,11 @@ struct rspamd_fuzzy_storage_ctx {
struct rspamd_fuzzy_backend *backend;
};
-struct rspamd_legacy_fuzzy_node {
- gint32 value;
- gint32 flag;
- guint64 time;
- rspamd_fuzzy_t h;
-};
-
struct fuzzy_session {
struct rspamd_worker *worker;
struct rspamd_fuzzy_cmd *cmd;
gint fd;
guint64 time;
- gboolean legacy;
rspamd_inet_addr_t *addr;
struct rspamd_fuzzy_storage_ctx *ctx;
};
@@ -114,28 +105,9 @@ rspamd_fuzzy_write_reply (struct fuzzy_session *session,
struct rspamd_fuzzy_reply *rep)
{
gint r;
- gchar buf[64];
-
- if (session->legacy) {
- if (rep->prob > 0.5) {
- if (session->cmd->cmd == FUZZY_CHECK) {
- r = rspamd_snprintf (buf, sizeof (buf), "OK %d %d" CRLF,
- rep->value, rep->flag);
- }
- else {
- r = rspamd_snprintf (buf, sizeof (buf), "OK" CRLF);
- }
- }
- else {
- r = rspamd_snprintf (buf, sizeof (buf), "ERR" CRLF);
- }
- r = rspamd_inet_address_sendto (session->fd, buf, r, 0, session->addr);
- }
- else {
- r = rspamd_inet_address_sendto (session->fd, rep, sizeof (*rep), 0,
- session->addr);
- }
+ r = rspamd_inet_address_sendto (session->fd, rep, sizeof (*rep), 0,
+ session->addr);
if (r == -1) {
if (errno == EINTR) {
@@ -240,8 +212,7 @@ accept_fuzzy_socket (gint fd, short what, void *arg)
struct fuzzy_session session;
gint r;
guint8 buf[2048];
- struct rspamd_fuzzy_cmd *cmd = NULL, lcmd;
- struct legacy_fuzzy_cmd *l;
+ struct rspamd_fuzzy_cmd *cmd = NULL;
enum rspamd_fuzzy_epoch epoch = RSPAMD_FUZZY_EPOCH_MAX;
session.worker = worker;
@@ -262,22 +233,8 @@ accept_fuzzy_socket (gint fd, short what, void *arg)
return;
}
- if ((guint)r == sizeof (struct legacy_fuzzy_cmd)) {
- session.legacy = TRUE;
- l = (struct legacy_fuzzy_cmd *)buf;
- lcmd.version = 2;
- memcpy (lcmd.digest, l->hash, sizeof (lcmd.digest));
- lcmd.cmd = l->cmd;
- lcmd.flag = l->flag;
- lcmd.shingles_count = 0;
- lcmd.value = l->value;
- lcmd.tag = 0;
- cmd = &lcmd;
- epoch = RSPAMD_FUZZY_EPOCH6;
- }
- else if ((guint)r >= sizeof (struct rspamd_fuzzy_cmd)) {
+ if ((guint)r >= sizeof (struct rspamd_fuzzy_cmd)) {
/* Check shingles count sanity */
- session.legacy = FALSE;
cmd = (struct rspamd_fuzzy_cmd *)buf;
epoch = rspamd_fuzzy_command_valid (cmd, r);
if (epoch == RSPAMD_FUZZY_EPOCH_MAX) {
diff --git a/src/fuzzy_storage.h b/src/fuzzy_storage.h
index e2803c52e..b9997da8b 100644
--- a/src/fuzzy_storage.h
+++ b/src/fuzzy_storage.h
@@ -3,7 +3,6 @@
#include "config.h"
#include "main.h"
-#include "fuzzy.h"
#include "shingles.h"
#define RSPAMD_FUZZY_VERSION 3
@@ -13,14 +12,6 @@
#define FUZZY_WRITE 1
#define FUZZY_DEL 2
-struct legacy_fuzzy_cmd {
- u_char cmd;
- guint32 blocksize;
- gint32 value;
- gint32 flag;
- u_char hash[FUZZY_HASHLEN];
-};
-
RSPAMD_PACKED(rspamd_fuzzy_cmd) {
guint8 version;
guint8 cmd;
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 04e7cd5f3..b509b23cd 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -7,7 +7,6 @@
#define RSPAMD_MESSAGE_H
#include "config.h"
-#include "fuzzy.h"
struct rspamd_task;
struct controller_session;
@@ -43,11 +42,8 @@ struct mime_text_part {
GByteArray *content;
struct html_content *html;
GList *urls_offset; /**< list of offsets of urls */
- rspamd_fuzzy_t *fuzzy;
- rspamd_fuzzy_t *double_fuzzy;
GMimeObject *parent;
struct mime_part *mime_part;
- rspamd_fstring_t *diff_str;
GArray *words;
GArray *normalized_words;
guint nlines;
diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c
index bff70c1b7..a4c02989e 100644
--- a/src/libmime/mime_expressions.c
+++ b/src/libmime/mime_expressions.c
@@ -27,7 +27,6 @@
#include "cfg_file.h"
#include "main.h"
#include "message.h"
-#include "fuzzy.h"
#include "mime_expressions.h"
#include "html.h"
#include "lua/lua_common.h"
diff --git a/src/libserver/fuzzy_backend.c b/src/libserver/fuzzy_backend.c
index 7cd4faa91..a5bf28c7c 100644
--- a/src/libserver/fuzzy_backend.c
+++ b/src/libserver/fuzzy_backend.c
@@ -24,20 +24,9 @@
#include "config.h"
#include "main.h"
#include "fuzzy_backend.h"
-#include "fuzzy_storage.h"
#include <sqlite3.h>
-/* Magic sequence for hashes file */
-#define FUZZY_FILE_MAGIC "rsh"
-
-struct rspamd_legacy_fuzzy_node {
- gint32 value;
- gint32 flag;
- guint64 time;
- rspamd_fuzzy_t h;
-};
-
struct rspamd_fuzzy_backend {
sqlite3 *db;
char *path;
@@ -45,7 +34,6 @@ struct rspamd_fuzzy_backend {
gsize expired;
};
-
static const char *create_tables_sql =
"BEGIN;"
"CREATE TABLE digests("
@@ -393,80 +381,11 @@ rspamd_fuzzy_backend_open_db (const gchar *path, GError **err)
return bk;
}
-/*
- * Convert old database to the new format
- */
-static gboolean
-rspamd_fuzzy_backend_convert (const gchar *path, int fd, GError **err)
-{
- gchar tmpdb[PATH_MAX];
- struct rspamd_fuzzy_backend *nbackend;
- struct stat st;
- gint off;
- guint8 *map, *p, *end;
- struct rspamd_legacy_fuzzy_node *n;
-
- rspamd_snprintf (tmpdb, sizeof (tmpdb), "%s.converted", path);
- (void)unlink (tmpdb);
- nbackend = rspamd_fuzzy_backend_create_db (tmpdb, FALSE, err);
-
- if (nbackend == NULL) {
- return FALSE;
- }
-
- (void)fstat (fd, &st);
- (void)lseek (fd, 0, SEEK_SET);
-
- off = sizeof (FUZZY_FILE_MAGIC);
- if (off >= st.st_size) {
- msg_warn ("old fuzzy storage is empty or corrupted, remove it");
- }
- else {
- if ((map = mmap (NULL, st.st_size - off, PROT_READ, MAP_SHARED, fd,
- 0)) == MAP_FAILED) {
- g_set_error (err, rspamd_fuzzy_backend_quark (),
- errno, "Cannot mmap file %s: %s",
- path, strerror (errno));
- rspamd_fuzzy_backend_close (nbackend);
-
- return FALSE;
- }
-
- end = map + st.st_size;
- p = map + off;
-
- rspamd_fuzzy_backend_run_simple (RSPAMD_FUZZY_BACKEND_TRANSACTION_START,
- nbackend, NULL);
- while (p < end) {
- n = (struct rspamd_legacy_fuzzy_node *)p;
- /* Convert node flag, digest, value, time */
- if (rspamd_fuzzy_backend_run_stmt (nbackend, RSPAMD_FUZZY_BACKEND_INSERT,
- (gint)n->flag, n->h.hash_pipe,
- (gint64)n->value, n->time) != SQLITE_OK) {
- msg_warn ("Cannot execute init sql %s: %s",
- prepared_stmts[RSPAMD_FUZZY_BACKEND_INSERT].sql,
- sqlite3_errmsg (nbackend->db));
- }
- p += sizeof (struct rspamd_legacy_fuzzy_node);
- }
-
- munmap (map, st.st_size);
- rspamd_fuzzy_backend_run_simple (RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT,
- nbackend, NULL);
- }
-
- rspamd_fuzzy_backend_run_sql (create_index_sql, nbackend, NULL);
- rspamd_fuzzy_backend_close (nbackend);
- rename (tmpdb, path);
-
- return TRUE;
-}
-
struct rspamd_fuzzy_backend*
rspamd_fuzzy_backend_open (const gchar *path, GError **err)
{
- gchar *dir, header[4];
- gint fd, r;
+ gchar *dir;
+ gint fd;
struct rspamd_fuzzy_backend *res;
static const char sqlite_wal[] = "PRAGMA journal_mode=\"wal\";",
fallback_journal[] = "PRAGMA journal_mode=\"off\";";
@@ -501,21 +420,6 @@ rspamd_fuzzy_backend_open (const gchar *path, GError **err)
return NULL;
}
}
- else {
-
- /* Check for legacy format */
- if ((r = read (fd, header, sizeof (header))) == sizeof (header)) {
- if (memcmp (header, FUZZY_FILE_MAGIC, sizeof (header) - 1) == 0) {
- msg_info ("Trying to convert old fuzzy database");
- if (!rspamd_fuzzy_backend_convert (path, fd, err)) {
- close (fd);
- return NULL;
- }
- msg_info ("Old database converted");
- }
- close (fd);
- }
- }
close (fd);
diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c
index a4d78427f..1fedbbb46 100644
--- a/src/libserver/protocol.c
+++ b/src/libserver/protocol.c
@@ -645,20 +645,9 @@ write_hashes_to_log (struct rspamd_task *task, GString *logbuf)
struct mime_text_part *text_part;
guint i;
+ /* TODO: rework parts hashes */
for (i = 0; i < task->text_parts->len; i ++) {
text_part = g_ptr_array_index (task->text_parts, i);
-
- if (text_part->fuzzy) {
- if (i != task->text_parts->len - 1) {
- rspamd_printf_gstring (logbuf,
- " part: %Xd,",
- text_part->fuzzy->h);
- }
- else {
- rspamd_printf_gstring (logbuf, " part: %Xd",
- text_part->fuzzy->h);
- }
- }
}
}
diff --git a/src/libutil/CMakeLists.txt b/src/libutil/CMakeLists.txt
index 61e5d6d15..338a74027 100644
--- a/src/libutil/CMakeLists.txt
+++ b/src/libutil/CMakeLists.txt
@@ -6,7 +6,6 @@ SET(LIBRSPAMDUTILSRC
${CMAKE_CURRENT_SOURCE_DIR}/diff.c
${CMAKE_CURRENT_SOURCE_DIR}/expression.c
${CMAKE_CURRENT_SOURCE_DIR}/fstring.c
- ${CMAKE_CURRENT_SOURCE_DIR}/fuzzy.c
${CMAKE_CURRENT_SOURCE_DIR}/hash.c
${CMAKE_CURRENT_SOURCE_DIR}/http.c
${CMAKE_CURRENT_SOURCE_DIR}/keypairs_cache.c
diff --git a/src/libutil/fuzzy.c b/src/libutil/fuzzy.c
deleted file mode 100644
index 218065b77..000000000
--- a/src/libutil/fuzzy.c
+++ /dev/null
@@ -1,557 +0,0 @@
-/*
- * Copyright (c) 2009-2012, Vsevolod Stakhov
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-
-#include "config.h"
-#include "mem_pool.h"
-#include "fstring.h"
-#include "fuzzy.h"
-#include "message.h"
-#include "url.h"
-#include "main.h"
-#include "xxhash.h"
-
-#define ROLL_WINDOW_SIZE 9
-#define MIN_FUZZY_BLOCK_SIZE 3
-#define HASH_INIT 0x28021967
-
-static const char *b64 =
- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-
-struct roll_state {
- guint32 h[3];
- gchar window[ROLL_WINDOW_SIZE];
- gint n;
-};
-
-static struct roll_state rs;
-
-
-/* Rolling hash function based on Adler-32 checksum */
-static guint32
-fuzzy_roll_hash (guint c)
-{
- /* Check window position */
- if (rs.n == ROLL_WINDOW_SIZE) {
- rs.n = 0;
- }
-
- rs.h[1] -= rs.h[0];
- rs.h[1] += ROLL_WINDOW_SIZE * c;
-
- rs.h[0] += c;
- rs.h[0] -= rs.window[rs.n];
-
- /* Save current symbol */
- rs.window[rs.n] = c;
- rs.n++;
-
- rs.h[2] <<= 5;
- rs.h[2] ^= c;
-
- return rs.h[0] + rs.h[1] + rs.h[2];
-}
-
-/* A simple non-rolling hash, based on the FNV hash */
-static guint32
-fuzzy_fnv_hash (guint c, guint32 hval)
-{
- hval ^= c;
- hval +=
- (hval << 1) + (hval << 4) + (hval << 7) + (hval << 8) + (hval << 24);
- return hval;
-}
-
-/* Calculate blocksize depending on length of input */
-static guint32
-fuzzy_blocksize (guint32 len)
-{
- guint32 nlen = MIN_FUZZY_BLOCK_SIZE;
-
- while (nlen * (FUZZY_HASHLEN - 1) < len) {
- nlen *= 2;
- }
- return nlen;
-}
-
-
-/* Update hash with new symbol */
-static void
-fuzzy_update (rspamd_fuzzy_t * h, guint c)
-{
- h->rh = fuzzy_roll_hash (c);
- h->h = fuzzy_fnv_hash (c, h->h);
-
- if (h->rh % h->block_size == (h->block_size - 1)) {
- h->hash_pipe[h->hi] = b64[h->h % 64];
- if (h->hi < FUZZY_HASHLEN - 2) {
- h->h = HASH_INIT;
- h->hi++;
- }
- }
-}
-
-static void
-fuzzy_update2 (rspamd_fuzzy_t * h1, rspamd_fuzzy_t *h2, guint c)
-{
- h1->rh = fuzzy_roll_hash (c);
- h1->h = fuzzy_fnv_hash (c, h1->h);
- h2->rh = h1->rh;
- h2->h = fuzzy_fnv_hash (c, h2->h);
-
- if (h1->rh % h1->block_size == (h1->block_size - 1)) {
- h1->hash_pipe[h1->hi] = b64[h1->h % 64];
- if (h1->hi < FUZZY_HASHLEN - 2) {
- h1->h = HASH_INIT;
- h1->hi++;
- }
- }
- if (h2->rh % h2->block_size == (h2->block_size - 1)) {
- h2->hash_pipe[h2->hi] = b64[h2->h % 64];
- if (h2->hi < FUZZY_HASHLEN - 2) {
- h2->h = HASH_INIT;
- h2->hi++;
- }
- }
-}
-
-/*
- * Levenshtein distance between string1 and string2.
- *
- * Replace cost is normally 1, and 2 with nonzero xcost.
- */
-guint32
-rspamd_levinstein_distance (gchar *s1, gint len1, gchar *s2, gint len2)
-{
- gint i;
- gint *row; /* we only need to keep one row of costs */
- gint *end;
- gint half, nx;
- gchar *sx, *char2p, char1;
- gint *p, D, x, offset, c3;
-
- /* strip common prefix */
- while (len1 > 0 && len2 > 0 && *s1 == *s2) {
- len1--;
- len2--;
- s1++;
- s2++;
- }
-
- /* strip common suffix */
- while (len1 > 0 && len2 > 0 && s1[len1 - 1] == s2[len2 - 1]) {
- len1--;
- len2--;
- }
-
- /* catch trivial cases */
- if (len1 == 0) {
- return len2;
- }
-
- if (len2 == 0) {
- return len1;
- }
-
- /* make the inner cycle (i.e. string2) the longer one */
- if (len1 > len2) {
- nx = len1;
- sx = s1;
- len1 = len2;
- len2 = nx;
- s1 = s2;
- s2 = sx;
- }
- /* check len1 == 1 separately */
- if (len1 == 1) {
- return len2 - (memchr (s2, *s1, len2) != NULL);
- }
-
- len1++;
- len2++;
- half = len1 >> 1;
-
- /* initalize first row */
- row = g_malloc (len2 * sizeof (gint));
- end = row + len2 - 1;
- for (i = 0; i < len2; i++) {
- row[i] = i;
- }
-
- /* in this case we don't have to scan two corner triangles (of size len1/2)
- * in the matrix because no best path can go throught them. note this
- * breaks when len1 == len2 == 2 so the memchr() special case above is
- * necessary */
- row[0] = len1 - half - 1;
- for (i = 1; i < len1; i++) {
- char1 = s1[i - 1];
- /* skip the upper triangle */
- if (i >= len1 - half) {
- offset = i - (len1 - half);
- char2p = s2 + offset;
- p = row + offset;
- c3 = *(p++) + (char1 != *(char2p++));
- x = *p;
- x++;
- D = x;
- if (x > c3)
- x = c3;
- *(p++) = x;
- }
- else {
- p = row + 1;
- char2p = s2;
- D = x = i;
- }
- /* skip the lower triangle */
- if (i <= half + 1)
- end = row + len2 + i - half - 2;
- /* main */
- while (p <= end) {
- c3 = --D + (char1 != *(char2p++));
- x++;
- if (x > c3)
- x = c3;
- D = *p;
- D++;
- if (x > D)
- x = D;
- *(p++) = x;
- }
- /* lower triangle sentinel */
- if (i <= half) {
- c3 = --D + (char1 != *char2p);
- x++;
- if (x > c3)
- x = c3;
- *p = x;
- }
- }
-
- i = *end;
- g_free (row);
- return i;
-}
-
-/* Calculate fuzzy hash for specified string */
-rspamd_fuzzy_t *
-rspamd_fuzzy_init (rspamd_fstring_t * in, rspamd_mempool_t * pool)
-{
- rspamd_fuzzy_t *new;
- guint i, repeats = 0;
- gchar *c = in->begin, last = '\0';
- gsize real_len = 0;
-
- new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_fuzzy_t));
- bzero (&rs, sizeof (rs));
- for (i = 0; i < in->len; i++) {
- if (*c == last) {
- repeats++;
- }
- else {
- repeats = 0;
- }
- if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) {
- real_len++;
- }
- last = *c;
- c++;
- }
-
- new->block_size = fuzzy_blocksize (real_len);
- c = in->begin;
-
- for (i = 0; i < in->len; i++) {
- if (*c == last) {
- repeats++;
- }
- else {
- repeats = 0;
- }
- if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) {
- fuzzy_update (new, *c);
- }
- last = *c;
- c++;
- }
-
- /* Check whether we have more bytes in a rolling window */
- if (new->rh != 0) {
- new->hash_pipe[new->hi] = b64[new->h % 64];
- }
-
- return new;
-}
-
-rspamd_fuzzy_t *
-rspamd_fuzzy_from_byte_array (GByteArray * in, rspamd_mempool_t * pool)
-{
- rspamd_fstring_t f;
-
- f.begin = (gchar *)in->data;
- f.len = in->len;
-
- return rspamd_fuzzy_init (&f, pool);
-}
-
-void
-rspamd_fuzzy_from_text_part (struct mime_text_part *part,
- rspamd_mempool_t *pool,
- gsize max_diff)
-{
- rspamd_fuzzy_t *new, *new2;
- gchar *c, *end, *begin, *p;
- gsize real_len = 0, len = part->content->len;
- GList *cur_offset;
- struct process_exception *cur_ex = NULL;
- gunichar uc;
- gboolean write_diff = FALSE;
-
- cur_offset = part->urls_offset;
- if (cur_offset != NULL) {
- cur_ex = cur_offset->data;
- }
-
- begin = (gchar *)part->content->data;
- c = begin;
- new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_fuzzy_t));
- new2 = rspamd_mempool_alloc0 (pool, sizeof (rspamd_fuzzy_t));
- bzero (&rs, sizeof (rs));
- end = c + len;
-
- if (IS_PART_UTF (part)) {
- while (c < end) {
- if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) {
- c += cur_ex->len + 1;
- cur_offset = g_list_next (cur_offset);
- if (cur_offset != NULL) {
- cur_ex = cur_offset->data;
- }
- }
- else {
- uc = g_utf8_get_char (c);
- if (g_unichar_isalnum (uc)) {
- p = g_utf8_next_char (c);
- real_len += p - c;
- }
- else {
- p = g_utf8_next_char (c);
- }
- c = p;
- }
- }
- }
- else {
- while (c < end) {
- if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) {
- c += cur_ex->len + 1;
- cur_offset = g_list_next (cur_offset);
- if (cur_offset != NULL) {
- cur_ex = cur_offset->data;
- }
- }
- else {
- if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) {
- real_len++;
- }
- c++;
- }
- }
- }
-
- write_diff = real_len > 0 && real_len < max_diff;
-
- if (write_diff) {
- part->diff_str = rspamd_fstralloc (pool, real_len + 1);
- }
- else {
- part->diff_str = NULL;
- }
-
- new->block_size = fuzzy_blocksize (real_len);
- new2->block_size = new->block_size * 2;
-
- cur_offset = part->urls_offset;
- if (cur_offset != NULL) {
- cur_ex = cur_offset->data;
- }
-
- begin = (gchar *)part->content->data;
- c = begin;
- end = c + len;
- if (IS_PART_UTF (part)) {
-
- while (c < end) {
- if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) {
- c += cur_ex->len + 1;
- cur_offset = g_list_next (cur_offset);
- if (cur_offset != NULL) {
- cur_ex = cur_offset->data;
- }
- }
- else {
- uc = g_utf8_get_char (c);
- if (g_unichar_isalnum (uc)) {
- fuzzy_update2 (new, new2, uc);
- if (write_diff) {
- rspamd_fstrappend_u (part->diff_str, uc);
- }
- }
- c = g_utf8_next_char (c);
- }
- }
- }
- else {
- while (c < end) {
- if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) {
- c += cur_ex->len + 1;
- cur_offset = g_list_next (cur_offset);
- if (cur_offset != NULL) {
- cur_ex = cur_offset->data;
- }
- }
- else {
- if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) {
- fuzzy_update2 (new, new2, *c);
- if (write_diff) {
- rspamd_fstrappend_c (part->diff_str, *c);
- }
- }
- c++;
- }
- }
- }
-
- /* Check whether we have more bytes in a rolling window */
- if (new->rh != 0) {
- new->hash_pipe[new->hi] = b64[new->h % 64];
- }
- if (new2->rh != 0) {
- new2->hash_pipe[new2->hi] = b64[new2->h % 64];
- }
-
- part->fuzzy = new;
- part->double_fuzzy = new2;
-}
-
-/* Compare score of difference between two hashes 0 - different hashes, 100 - identical hashes */
-gint
-rspamd_fuzzy_compare (rspamd_fuzzy_t * h1, rspamd_fuzzy_t * h2)
-{
- gint res, l1, l2;
-
- /* If we have hashes of different size, input strings are too different */
- if (h1->block_size != h2->block_size) {
- return 0;
- }
-
- l1 = strlen (h1->hash_pipe);
- l2 = strlen (h2->hash_pipe);
-
- if (l1 == 0 || l2 == 0) {
- if (l1 == 0 && l2 == 0) {
- return 100;
- }
- else {
- return 0;
- }
- }
-
- res = rspamd_levinstein_distance (h1->hash_pipe, l1, h2->hash_pipe, l2);
- res = 100 - (2 * res * 100) / (l1 + l2);
-
- return res;
-}
-
-gint
-rspamd_fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2)
-{
- if (p1->fuzzy != NULL && p2->fuzzy != NULL) {
- if (p1->fuzzy->block_size == p2->fuzzy->block_size) {
- return rspamd_fuzzy_compare (p1->fuzzy, p2->fuzzy);
- }
- else if (p1->double_fuzzy->block_size == p2->fuzzy->block_size) {
- return rspamd_fuzzy_compare (p1->double_fuzzy, p2->fuzzy);
- }
- else if (p2->double_fuzzy->block_size == p1->fuzzy->block_size) {
- return rspamd_fuzzy_compare (p2->double_fuzzy, p1->fuzzy);
- }
- }
-
- return 0;
-}
-
-gint
-rspamd_fuzzy_len (rspamd_fuzzy_t *h)
-{
- gint len;
- void *nullpos;
-
- nullpos = memchr (h->hash_pipe, '\0', sizeof (h->hash_pipe));
-
- if (nullpos == NULL) {
- len = sizeof (h->hash_pipe);
- }
- else {
- len = (char *)nullpos - h->hash_pipe;
- }
-
- return len;
-}
-
-guint
-rspamd_fuzzy_hash (gconstpointer key)
-{
- rspamd_fuzzy_t *fh = (rspamd_fuzzy_t *)key;
- XXH64_state_t xxh;
-
- XXH64_reset (&xxh, rspamd_hash_seed ());
-
- XXH64_update (&xxh, &fh->block_size, sizeof (fh->block_size));
- XXH64_update (&xxh, fh->hash_pipe, rspamd_fuzzy_len (fh));
-
- return XXH64_digest (&xxh);
-}
-
-gboolean
-rspamd_fuzzy_equal (gconstpointer v1, gconstpointer v2)
-{
- rspamd_fuzzy_t *fh1= (rspamd_fuzzy_t *)v1,
- *fh2 = (rspamd_fuzzy_t *)v2;
-
- if (fh1->block_size == fh2->block_size) {
- gint l1 = rspamd_fuzzy_len (fh1),
- l2 = rspamd_fuzzy_len (fh2);
-
- if (l1 == l2) {
- return (memcmp (fh1->hash_pipe, fh2->hash_pipe, l1) == 0);
- }
- }
-
- return FALSE;
-}
-
-/*
- * vi:ts=4
- */
diff --git a/src/libutil/fuzzy.h b/src/libutil/fuzzy.h
deleted file mode 100644
index 813599c6b..000000000
--- a/src/libutil/fuzzy.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * @file fuzzy.h
- * Fuzzy hashes API
- */
-
-#ifndef RSPAMD_FUZZY_H
-#define RSPAMD_FUZZY_H
-
-#include "config.h"
-#include "mem_pool.h"
-#include "fstring.h"
-
-#define FUZZY_HASHLEN 64
-
-typedef struct fuzzy_hash_s {
- gchar hash_pipe[FUZZY_HASHLEN]; /**< result hash */
- guint32 block_size; /**< current blocksize */
- guint32 rh; /**< roll hash value */
- guint32 h; /**< hash of block */
- guint32 hi; /**< current index in hash pipe */
-} rspamd_fuzzy_t;
-
-struct mime_text_part;
-
-/**
- * Calculate fuzzy hash for specified string
- * @param in input string
- * @param pool pool object
- * @return fuzzy_hash object allocated in pool
- */
-rspamd_fuzzy_t * rspamd_fuzzy_init (rspamd_fstring_t *in, rspamd_mempool_t *pool);
-/**
- * Calculate fuzzy hash for specified byte array
- * @param in input string
- * @param pool pool object
- * @return fuzzy_hash object allocated in pool
- */
-rspamd_fuzzy_t * rspamd_fuzzy_from_byte_array (GByteArray *in, rspamd_mempool_t *pool);
-
-/**
- * Calculate fuzzy hash for specified text part
- * @param part text part object
- * @param pool pool object
- * @param max_diff maximum text length to use diff algorithm in comparasions
- * @return fuzzy_hash object allocated in pool
- */
-void rspamd_fuzzy_from_text_part (struct mime_text_part *part,
- rspamd_mempool_t *pool,
- gsize max_diff);
-
-/**
- * Compare score of difference between two hashes
- * @param h1 first hash
- * @param h2 second hash
- * @return result in percents 0 - different hashes, 100 - identical hashes
- */
-gint rspamd_fuzzy_compare (rspamd_fuzzy_t *h1, rspamd_fuzzy_t *h2);
-
-/*
- * Compare two text parts and return percents of difference
- */
-gint rspamd_fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2);
-
-/*
- * Calculate levenstein distance between two strings. Note: this algorithm should be used
- * only for short texts - it runs too slow on long ones.
- */
-guint32 rspamd_levinstein_distance (gchar *s1, gint len1, gchar *s2, gint len2);
-
-/*
- * Hash table utilities
- */
-gint rspamd_fuzzy_len (rspamd_fuzzy_t *h);
-guint rspamd_fuzzy_hash (gconstpointer key);
-gboolean rspamd_fuzzy_equal (gconstpointer v1, gconstpointer v2);
-
-#endif
diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c
index 085295b18..323ebfd32 100644
--- a/src/lua/lua_mimepart.c
+++ b/src/lua/lua_mimepart.c
@@ -86,12 +86,6 @@ LUA_FUNCTION_DEF (textpart, is_empty);
*/
LUA_FUNCTION_DEF (textpart, is_html);
/***
- * @method text_part:get_fuzzy()
- * Returns base32 encoded value of fuzzy hash of the specified part
- * @return {string} fuzzy hash value
- */
-LUA_FUNCTION_DEF (textpart, get_fuzzy);
-/***
* @method text_part:get_language()
* Returns the code of the most used unicode script in the text part. Does not work with raw parts
* @return {string} short abbreviation (such as `ru`) for the script's language
@@ -103,16 +97,6 @@ LUA_FUNCTION_DEF (textpart, get_language);
* @return {mimepart} mimepart object
*/
LUA_FUNCTION_DEF (textpart, get_mimepart);
-/***
- * @method text_part:compare_distance(other)
- * Calculates the difference to another text part. This function is intended to work with
- * the parts of `multipart/alternative` container only. If the two parts are not the parts of the
- * same `multipart/alternative` container, then they are considered as unrelated and
- * `-1` is returned.
- * @param {text_part} other text part to compare
- * @return {integer} commodity percentage (e.g. the same strings give `100`, different give `0` and unrelated give `-1`)
- */
-LUA_FUNCTION_DEF (textpart, compare_distance);
static const struct luaL_reg textpartlib_m[] = {
LUA_INTERFACE_DEF (textpart, is_utf),
@@ -121,10 +105,8 @@ static const struct luaL_reg textpartlib_m[] = {
LUA_INTERFACE_DEF (textpart, get_lines_count),
LUA_INTERFACE_DEF (textpart, is_empty),
LUA_INTERFACE_DEF (textpart, is_html),
- LUA_INTERFACE_DEF (textpart, get_fuzzy),
LUA_INTERFACE_DEF (textpart, get_language),
LUA_INTERFACE_DEF (textpart, get_mimepart),
- LUA_INTERFACE_DEF (textpart, compare_distance),
{"__tostring", rspamd_lua_class_tostring},
{NULL, NULL}
};
@@ -353,24 +335,6 @@ lua_textpart_is_html (lua_State * L)
return 1;
}
-static gint
-lua_textpart_get_fuzzy (lua_State * L)
-{
- struct mime_text_part *part = lua_check_textpart (L);
- gchar *out;
-
- if (part == NULL || IS_PART_EMPTY (part)) {
- lua_pushnil (L);
- return 1;
- }
-
- out = rspamd_encode_base32 (part->fuzzy->hash_pipe,
- strlen (part->fuzzy->hash_pipe));
- lua_pushstring (L, out);
- g_free (out);
-
- return 1;
-}
static gint
lua_textpart_get_language (lua_State * L)
@@ -408,60 +372,6 @@ lua_textpart_get_mimepart (lua_State * L)
return 1;
}
-static gint
-lua_textpart_compare_distance (lua_State * L)
-{
- struct mime_text_part *part = lua_check_textpart (L), *other;
- void *ud = luaL_checkudata (L, 2, "rspamd{textpart}");
- gint diff = -1;
- GMimeObject *parent;
- const GMimeContentType *ct;
-
- luaL_argcheck (L, ud != NULL, 2, "'textpart' expected");
- other = ud ? *((struct mime_text_part **)ud) : NULL;
-
- if (other != NULL && part->parent && part->parent == other->parent) {
- parent = part->parent;
- ct = g_mime_object_get_content_type (parent);
-#ifndef GMIME24
- if (ct == NULL ||
- !g_mime_content_type_is_type (ct, "multipart", "alternative")) {
-#else
- if (ct == NULL ||
- !g_mime_content_type_is_type ((GMimeContentType *)ct, "multipart",
- "alternative")) {
-#endif
- diff = -1;
-
- }
- else {
- if (!IS_PART_EMPTY (part) && !IS_PART_EMPTY (other)) {
- if (part->diff_str != NULL && other->diff_str != NULL) {
- diff = rspamd_diff_distance (part->diff_str,
- other->diff_str);
- }
- else {
- diff = rspamd_fuzzy_compare_parts (part, other);
- }
- }
- else if ((IS_PART_EMPTY (part) &&
- !IS_PART_EMPTY (other)) || (!IS_PART_EMPTY (part) &&
- IS_PART_EMPTY (other))) {
- /* Empty and non empty parts are different */
- diff = 0;
- }
- }
- }
- else {
- diff = -1;
- }
-
-
- lua_pushinteger (L, diff);
-
- return 1;
-}
-
/* Mimepart implementation */
static gint
diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c
index ec849da54..857033ec0 100644
--- a/src/plugins/fuzzy_check.c
+++ b/src/plugins/fuzzy_check.c
@@ -563,46 +563,34 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule,
rspamd_fstring_t *word;
GArray *words;
- if (legacy || part->words == NULL || part->words->len == 0) {
- cmd = rspamd_mempool_alloc0 (pool, sizeof (*cmd));
+ shcmd = rspamd_mempool_alloc0 (pool, sizeof (*shcmd));
- cmd->shingles_count = 0;
- rspamd_strlcpy (cmd->digest, part->fuzzy->hash_pipe, sizeof (cmd->digest));
+ /*
+ * Generate hash from all words in the part
+ */
+ g_assert (blake2b_init_key (&st, BLAKE2B_OUTBYTES, rule->hash_key->str,
+ rule->hash_key->len) != -1);
+ words = fuzzy_preprocess_words (part, pool);
- if (size != NULL) {
- *size = sizeof (struct rspamd_fuzzy_cmd);
- }
+ for (i = 0; i < words->len; i ++) {
+ word = &g_array_index (words, rspamd_fstring_t, i);
+ blake2b_update (&st, word->begin, word->len);
}
- else {
- shcmd = rspamd_mempool_alloc0 (pool, sizeof (*shcmd));
-
- /*
- * Generate hash from all words in the part
- */
- g_assert (blake2b_init_key (&st, BLAKE2B_OUTBYTES, rule->hash_key->str,
- rule->hash_key->len) != -1);
- words = fuzzy_preprocess_words (part, pool);
+ blake2b_final (&st, shcmd->basic.digest, sizeof (shcmd->basic.digest));
- for (i = 0; i < words->len; i ++) {
- word = &g_array_index (words, rspamd_fstring_t, i);
- blake2b_update (&st, word->begin, word->len);
- }
- blake2b_final (&st, shcmd->basic.digest, sizeof (shcmd->basic.digest));
-
- msg_debug ("loading shingles with key %*xs", 16, rule->shingles_key->str);
- sh = rspamd_shingles_generate (words,
- rule->shingles_key->str, pool,
- rspamd_shingles_default_filter, NULL);
- if (sh != NULL) {
- memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl));
- shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;
- }
+ msg_debug ("loading shingles with key %*xs", 16, rule->shingles_key->str);
+ sh = rspamd_shingles_generate (words,
+ rule->shingles_key->str, pool,
+ rspamd_shingles_default_filter, NULL);
+ if (sh != NULL) {
+ memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl));
+ shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;
+ }
- cmd = (struct rspamd_fuzzy_cmd *)shcmd;
+ cmd = (struct rspamd_fuzzy_cmd *)shcmd;
- if (size != NULL) {
- *size = sizeof (struct rspamd_fuzzy_shingle_cmd);
- }
+ if (size != NULL) {
+ *size = sizeof (struct rspamd_fuzzy_shingle_cmd);
}
cmd->tag = ottery_rand_uint32 ();
@@ -959,7 +947,6 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule,
struct mime_part *mime_part;
struct rspamd_image *image;
struct rspamd_fuzzy_cmd *cmd;
- gsize hashlen;
guint i;
GPtrArray *res;
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 7c3312634..584bbbd4d 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,6 +1,5 @@
SET(TESTSRC rspamd_mem_pool_test.c
rspamd_statfile_test.c
- rspamd_fuzzy_test.c
rspamd_url_test.c
rspamd_dns_test.c
rspamd_async_test.c
diff --git a/test/rspamd_fuzzy_test.c b/test/rspamd_fuzzy_test.c
deleted file mode 100644
index b1f1f5dcd..000000000
--- a/test/rspamd_fuzzy_test.c
+++ /dev/null
@@ -1,76 +0,0 @@
-#include "config.h"
-#include "main.h"
-#include "fuzzy.h"
-#include "tests.h"
-
-static char *s1 = "This is sample test text.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n";
-static char *s2 = "This is sample test text.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n"
- "abcdefghijklmnopzrstuvwx.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n";
-static char *s3 = "";
-static char *s4 = "abcdefghijklmn\r\n";
-static char *s5 = "This is sample test text.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n"
- "abcdefghijklmnopzrstuvwx.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n"
- "abcdefghijklmnopqrstuvwx.\r\n";
-
-void
-rspamd_fuzzy_test_func ()
-{
- rspamd_mempool_t *pool;
- rspamd_fuzzy_t *h1, *h2, *h3, *h4, *h5;
- rspamd_fstring_t f1, f2, f3, f4, f5;
- int diff2;
-
- pool = rspamd_mempool_new (1024);
- f1.begin = s1;
- f1.len = strlen (s1);
- f2.begin = s2;
- f2.len = strlen (s2);
- f3.begin = s3;
- f3.len = strlen (s3);
- f4.begin = s4;
- f4.len = strlen (s4);
- f5.begin = s5;
- f5.len = strlen (s5);
-
- h1 = rspamd_fuzzy_init (&f1, pool);
- h2 = rspamd_fuzzy_init (&f2, pool);
- h3 = rspamd_fuzzy_init (&f3, pool);
- h4 = rspamd_fuzzy_init (&f4, pool);
- h5 = rspamd_fuzzy_init (&f5, pool);
-
- diff2 = rspamd_fuzzy_compare (h2, h5);
- msg_debug ("rspamd_fuzzy_test_func: s1, s2 difference between strings is %d", rspamd_fuzzy_compare (h1, h2));
- msg_debug ("rspamd_fuzzy_test_func: s1, s3 difference between strings is %d", rspamd_fuzzy_compare (h1, h3));
- msg_debug ("rspamd_fuzzy_test_func: s3, s4 difference between strings is %d", rspamd_fuzzy_compare (h3, h4));
- msg_debug ("rspamd_fuzzy_test_func: s2, s4 difference between strings is %d", rspamd_fuzzy_compare (h2, h4));
- msg_debug ("rspamd_fuzzy_test_func: s2, s5 difference between strings is %d", diff2);
-
- /* Identical strings */
- if (diff2 != 100) {
- msg_err ("hash difference is %d", diff2);
- g_assert (diff2 == 100);
- }
-
- rspamd_mempool_delete (pool);
-}
diff --git a/test/rspamd_test_suite.c b/test/rspamd_test_suite.c
index 5dc854560..c1a2e27f5 100644
--- a/test/rspamd_test_suite.c
+++ b/test/rspamd_test_suite.c
@@ -45,7 +45,6 @@ main (int argc, char **argv)
g_log_set_default_handler (rspamd_glib_log_function, rspamd_main->logger);
g_test_add_func ("/rspamd/mem_pool", rspamd_mem_pool_test_func);
- g_test_add_func ("/rspamd/fuzzy", rspamd_fuzzy_test_func);
g_test_add_func ("/rspamd/url", rspamd_url_test_func);
g_test_add_func ("/rspamd/statfile", rspamd_statfile_test_func);
g_test_add_func ("/rspamd/radix", rspamd_radix_test_func);
diff --git a/test/tests.h b/test/tests.h
index f0dab9c02..a2ba05b84 100644
--- a/test/tests.h
+++ b/test/tests.h
@@ -11,9 +11,6 @@ void rspamd_url_test_func (void);
/* Memory pools */
void rspamd_mem_pool_test_func (void);
-/* Fuzzy hashes */
-void rspamd_fuzzy_test_func (void);
-
/* Stat file */
void rspamd_statfile_test_func (void);