summaryrefslogtreecommitdiffstats
path: root/src/libutil
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2015-07-18 00:10:56 +0100
committerVsevolod Stakhov <vsevolod@highsecure.ru>2015-07-18 00:10:56 +0100
commitab15b9a3c95d6c0d37330c96d8827ac59b2fee78 (patch)
tree70a9faf4dc79456cfa4aa804a5ce3dbe5c695024 /src/libutil
parentaf127078a26a41e6254d97f760c2afcfea2110ef (diff)
downloadrspamd-ab15b9a3c95d6c0d37330c96d8827ac59b2fee78.tar.gz
rspamd-ab15b9a3c95d6c0d37330c96d8827ac59b2fee78.zip
Remove legacy fuzzy code completely.
Diffstat (limited to 'src/libutil')
-rw-r--r--src/libutil/CMakeLists.txt1
-rw-r--r--src/libutil/fuzzy.c557
-rw-r--r--src/libutil/fuzzy.h77
3 files changed, 0 insertions, 635 deletions
diff --git a/src/libutil/CMakeLists.txt b/src/libutil/CMakeLists.txt
index 61e5d6d15..338a74027 100644
--- a/src/libutil/CMakeLists.txt
+++ b/src/libutil/CMakeLists.txt
@@ -6,7 +6,6 @@ SET(LIBRSPAMDUTILSRC
${CMAKE_CURRENT_SOURCE_DIR}/diff.c
${CMAKE_CURRENT_SOURCE_DIR}/expression.c
${CMAKE_CURRENT_SOURCE_DIR}/fstring.c
- ${CMAKE_CURRENT_SOURCE_DIR}/fuzzy.c
${CMAKE_CURRENT_SOURCE_DIR}/hash.c
${CMAKE_CURRENT_SOURCE_DIR}/http.c
${CMAKE_CURRENT_SOURCE_DIR}/keypairs_cache.c
diff --git a/src/libutil/fuzzy.c b/src/libutil/fuzzy.c
deleted file mode 100644
index 218065b77..000000000
--- a/src/libutil/fuzzy.c
+++ /dev/null
@@ -1,557 +0,0 @@
-/*
- * Copyright (c) 2009-2012, Vsevolod Stakhov
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-
-#include "config.h"
-#include "mem_pool.h"
-#include "fstring.h"
-#include "fuzzy.h"
-#include "message.h"
-#include "url.h"
-#include "main.h"
-#include "xxhash.h"
-
-#define ROLL_WINDOW_SIZE 9
-#define MIN_FUZZY_BLOCK_SIZE 3
-#define HASH_INIT 0x28021967
-
-static const char *b64 =
- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-
-struct roll_state {
- guint32 h[3];
- gchar window[ROLL_WINDOW_SIZE];
- gint n;
-};
-
-static struct roll_state rs;
-
-
-/* Rolling hash function based on Adler-32 checksum */
-static guint32
-fuzzy_roll_hash (guint c)
-{
- /* Check window position */
- if (rs.n == ROLL_WINDOW_SIZE) {
- rs.n = 0;
- }
-
- rs.h[1] -= rs.h[0];
- rs.h[1] += ROLL_WINDOW_SIZE * c;
-
- rs.h[0] += c;
- rs.h[0] -= rs.window[rs.n];
-
- /* Save current symbol */
- rs.window[rs.n] = c;
- rs.n++;
-
- rs.h[2] <<= 5;
- rs.h[2] ^= c;
-
- return rs.h[0] + rs.h[1] + rs.h[2];
-}
-
-/* A simple non-rolling hash, based on the FNV hash */
-static guint32
-fuzzy_fnv_hash (guint c, guint32 hval)
-{
- hval ^= c;
- hval +=
- (hval << 1) + (hval << 4) + (hval << 7) + (hval << 8) + (hval << 24);
- return hval;
-}
-
-/* Calculate blocksize depending on length of input */
-static guint32
-fuzzy_blocksize (guint32 len)
-{
- guint32 nlen = MIN_FUZZY_BLOCK_SIZE;
-
- while (nlen * (FUZZY_HASHLEN - 1) < len) {
- nlen *= 2;
- }
- return nlen;
-}
-
-
-/* Update hash with new symbol */
-static void
-fuzzy_update (rspamd_fuzzy_t * h, guint c)
-{
- h->rh = fuzzy_roll_hash (c);
- h->h = fuzzy_fnv_hash (c, h->h);
-
- if (h->rh % h->block_size == (h->block_size - 1)) {
- h->hash_pipe[h->hi] = b64[h->h % 64];
- if (h->hi < FUZZY_HASHLEN - 2) {
- h->h = HASH_INIT;
- h->hi++;
- }
- }
-}
-
-static void
-fuzzy_update2 (rspamd_fuzzy_t * h1, rspamd_fuzzy_t *h2, guint c)
-{
- h1->rh = fuzzy_roll_hash (c);
- h1->h = fuzzy_fnv_hash (c, h1->h);
- h2->rh = h1->rh;
- h2->h = fuzzy_fnv_hash (c, h2->h);
-
- if (h1->rh % h1->block_size == (h1->block_size - 1)) {
- h1->hash_pipe[h1->hi] = b64[h1->h % 64];
- if (h1->hi < FUZZY_HASHLEN - 2) {
- h1->h = HASH_INIT;
- h1->hi++;
- }
- }
- if (h2->rh % h2->block_size == (h2->block_size - 1)) {
- h2->hash_pipe[h2->hi] = b64[h2->h % 64];
- if (h2->hi < FUZZY_HASHLEN - 2) {
- h2->h = HASH_INIT;
- h2->hi++;
- }
- }
-}
-
-/*
- * Levenshtein distance between string1 and string2.
- *
- * Replace cost is normally 1, and 2 with nonzero xcost.
- */
-guint32
-rspamd_levinstein_distance (gchar *s1, gint len1, gchar *s2, gint len2)
-{
- gint i;
- gint *row; /* we only need to keep one row of costs */
- gint *end;
- gint half, nx;
- gchar *sx, *char2p, char1;
- gint *p, D, x, offset, c3;
-
- /* strip common prefix */
- while (len1 > 0 && len2 > 0 && *s1 == *s2) {
- len1--;
- len2--;
- s1++;
- s2++;
- }
-
- /* strip common suffix */
- while (len1 > 0 && len2 > 0 && s1[len1 - 1] == s2[len2 - 1]) {
- len1--;
- len2--;
- }
-
- /* catch trivial cases */
- if (len1 == 0) {
- return len2;
- }
-
- if (len2 == 0) {
- return len1;
- }
-
- /* make the inner cycle (i.e. string2) the longer one */
- if (len1 > len2) {
- nx = len1;
- sx = s1;
- len1 = len2;
- len2 = nx;
- s1 = s2;
- s2 = sx;
- }
- /* check len1 == 1 separately */
- if (len1 == 1) {
- return len2 - (memchr (s2, *s1, len2) != NULL);
- }
-
- len1++;
- len2++;
- half = len1 >> 1;
-
- /* initalize first row */
- row = g_malloc (len2 * sizeof (gint));
- end = row + len2 - 1;
- for (i = 0; i < len2; i++) {
- row[i] = i;
- }
-
- /* in this case we don't have to scan two corner triangles (of size len1/2)
- * in the matrix because no best path can go throught them. note this
- * breaks when len1 == len2 == 2 so the memchr() special case above is
- * necessary */
- row[0] = len1 - half - 1;
- for (i = 1; i < len1; i++) {
- char1 = s1[i - 1];
- /* skip the upper triangle */
- if (i >= len1 - half) {
- offset = i - (len1 - half);
- char2p = s2 + offset;
- p = row + offset;
- c3 = *(p++) + (char1 != *(char2p++));
- x = *p;
- x++;
- D = x;
- if (x > c3)
- x = c3;
- *(p++) = x;
- }
- else {
- p = row + 1;
- char2p = s2;
- D = x = i;
- }
- /* skip the lower triangle */
- if (i <= half + 1)
- end = row + len2 + i - half - 2;
- /* main */
- while (p <= end) {
- c3 = --D + (char1 != *(char2p++));
- x++;
- if (x > c3)
- x = c3;
- D = *p;
- D++;
- if (x > D)
- x = D;
- *(p++) = x;
- }
- /* lower triangle sentinel */
- if (i <= half) {
- c3 = --D + (char1 != *char2p);
- x++;
- if (x > c3)
- x = c3;
- *p = x;
- }
- }
-
- i = *end;
- g_free (row);
- return i;
-}
-
-/* Calculate fuzzy hash for specified string */
-rspamd_fuzzy_t *
-rspamd_fuzzy_init (rspamd_fstring_t * in, rspamd_mempool_t * pool)
-{
- rspamd_fuzzy_t *new;
- guint i, repeats = 0;
- gchar *c = in->begin, last = '\0';
- gsize real_len = 0;
-
- new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_fuzzy_t));
- bzero (&rs, sizeof (rs));
- for (i = 0; i < in->len; i++) {
- if (*c == last) {
- repeats++;
- }
- else {
- repeats = 0;
- }
- if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) {
- real_len++;
- }
- last = *c;
- c++;
- }
-
- new->block_size = fuzzy_blocksize (real_len);
- c = in->begin;
-
- for (i = 0; i < in->len; i++) {
- if (*c == last) {
- repeats++;
- }
- else {
- repeats = 0;
- }
- if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) {
- fuzzy_update (new, *c);
- }
- last = *c;
- c++;
- }
-
- /* Check whether we have more bytes in a rolling window */
- if (new->rh != 0) {
- new->hash_pipe[new->hi] = b64[new->h % 64];
- }
-
- return new;
-}
-
-rspamd_fuzzy_t *
-rspamd_fuzzy_from_byte_array (GByteArray * in, rspamd_mempool_t * pool)
-{
- rspamd_fstring_t f;
-
- f.begin = (gchar *)in->data;
- f.len = in->len;
-
- return rspamd_fuzzy_init (&f, pool);
-}
-
-void
-rspamd_fuzzy_from_text_part (struct mime_text_part *part,
- rspamd_mempool_t *pool,
- gsize max_diff)
-{
- rspamd_fuzzy_t *new, *new2;
- gchar *c, *end, *begin, *p;
- gsize real_len = 0, len = part->content->len;
- GList *cur_offset;
- struct process_exception *cur_ex = NULL;
- gunichar uc;
- gboolean write_diff = FALSE;
-
- cur_offset = part->urls_offset;
- if (cur_offset != NULL) {
- cur_ex = cur_offset->data;
- }
-
- begin = (gchar *)part->content->data;
- c = begin;
- new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_fuzzy_t));
- new2 = rspamd_mempool_alloc0 (pool, sizeof (rspamd_fuzzy_t));
- bzero (&rs, sizeof (rs));
- end = c + len;
-
- if (IS_PART_UTF (part)) {
- while (c < end) {
- if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) {
- c += cur_ex->len + 1;
- cur_offset = g_list_next (cur_offset);
- if (cur_offset != NULL) {
- cur_ex = cur_offset->data;
- }
- }
- else {
- uc = g_utf8_get_char (c);
- if (g_unichar_isalnum (uc)) {
- p = g_utf8_next_char (c);
- real_len += p - c;
- }
- else {
- p = g_utf8_next_char (c);
- }
- c = p;
- }
- }
- }
- else {
- while (c < end) {
- if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) {
- c += cur_ex->len + 1;
- cur_offset = g_list_next (cur_offset);
- if (cur_offset != NULL) {
- cur_ex = cur_offset->data;
- }
- }
- else {
- if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) {
- real_len++;
- }
- c++;
- }
- }
- }
-
- write_diff = real_len > 0 && real_len < max_diff;
-
- if (write_diff) {
- part->diff_str = rspamd_fstralloc (pool, real_len + 1);
- }
- else {
- part->diff_str = NULL;
- }
-
- new->block_size = fuzzy_blocksize (real_len);
- new2->block_size = new->block_size * 2;
-
- cur_offset = part->urls_offset;
- if (cur_offset != NULL) {
- cur_ex = cur_offset->data;
- }
-
- begin = (gchar *)part->content->data;
- c = begin;
- end = c + len;
- if (IS_PART_UTF (part)) {
-
- while (c < end) {
- if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) {
- c += cur_ex->len + 1;
- cur_offset = g_list_next (cur_offset);
- if (cur_offset != NULL) {
- cur_ex = cur_offset->data;
- }
- }
- else {
- uc = g_utf8_get_char (c);
- if (g_unichar_isalnum (uc)) {
- fuzzy_update2 (new, new2, uc);
- if (write_diff) {
- rspamd_fstrappend_u (part->diff_str, uc);
- }
- }
- c = g_utf8_next_char (c);
- }
- }
- }
- else {
- while (c < end) {
- if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) {
- c += cur_ex->len + 1;
- cur_offset = g_list_next (cur_offset);
- if (cur_offset != NULL) {
- cur_ex = cur_offset->data;
- }
- }
- else {
- if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) {
- fuzzy_update2 (new, new2, *c);
- if (write_diff) {
- rspamd_fstrappend_c (part->diff_str, *c);
- }
- }
- c++;
- }
- }
- }
-
- /* Check whether we have more bytes in a rolling window */
- if (new->rh != 0) {
- new->hash_pipe[new->hi] = b64[new->h % 64];
- }
- if (new2->rh != 0) {
- new2->hash_pipe[new2->hi] = b64[new2->h % 64];
- }
-
- part->fuzzy = new;
- part->double_fuzzy = new2;
-}
-
-/* Compare score of difference between two hashes 0 - different hashes, 100 - identical hashes */
-gint
-rspamd_fuzzy_compare (rspamd_fuzzy_t * h1, rspamd_fuzzy_t * h2)
-{
- gint res, l1, l2;
-
- /* If we have hashes of different size, input strings are too different */
- if (h1->block_size != h2->block_size) {
- return 0;
- }
-
- l1 = strlen (h1->hash_pipe);
- l2 = strlen (h2->hash_pipe);
-
- if (l1 == 0 || l2 == 0) {
- if (l1 == 0 && l2 == 0) {
- return 100;
- }
- else {
- return 0;
- }
- }
-
- res = rspamd_levinstein_distance (h1->hash_pipe, l1, h2->hash_pipe, l2);
- res = 100 - (2 * res * 100) / (l1 + l2);
-
- return res;
-}
-
-gint
-rspamd_fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2)
-{
- if (p1->fuzzy != NULL && p2->fuzzy != NULL) {
- if (p1->fuzzy->block_size == p2->fuzzy->block_size) {
- return rspamd_fuzzy_compare (p1->fuzzy, p2->fuzzy);
- }
- else if (p1->double_fuzzy->block_size == p2->fuzzy->block_size) {
- return rspamd_fuzzy_compare (p1->double_fuzzy, p2->fuzzy);
- }
- else if (p2->double_fuzzy->block_size == p1->fuzzy->block_size) {
- return rspamd_fuzzy_compare (p2->double_fuzzy, p1->fuzzy);
- }
- }
-
- return 0;
-}
-
-gint
-rspamd_fuzzy_len (rspamd_fuzzy_t *h)
-{
- gint len;
- void *nullpos;
-
- nullpos = memchr (h->hash_pipe, '\0', sizeof (h->hash_pipe));
-
- if (nullpos == NULL) {
- len = sizeof (h->hash_pipe);
- }
- else {
- len = (char *)nullpos - h->hash_pipe;
- }
-
- return len;
-}
-
-guint
-rspamd_fuzzy_hash (gconstpointer key)
-{
- rspamd_fuzzy_t *fh = (rspamd_fuzzy_t *)key;
- XXH64_state_t xxh;
-
- XXH64_reset (&xxh, rspamd_hash_seed ());
-
- XXH64_update (&xxh, &fh->block_size, sizeof (fh->block_size));
- XXH64_update (&xxh, fh->hash_pipe, rspamd_fuzzy_len (fh));
-
- return XXH64_digest (&xxh);
-}
-
-gboolean
-rspamd_fuzzy_equal (gconstpointer v1, gconstpointer v2)
-{
- rspamd_fuzzy_t *fh1= (rspamd_fuzzy_t *)v1,
- *fh2 = (rspamd_fuzzy_t *)v2;
-
- if (fh1->block_size == fh2->block_size) {
- gint l1 = rspamd_fuzzy_len (fh1),
- l2 = rspamd_fuzzy_len (fh2);
-
- if (l1 == l2) {
- return (memcmp (fh1->hash_pipe, fh2->hash_pipe, l1) == 0);
- }
- }
-
- return FALSE;
-}
-
-/*
- * vi:ts=4
- */
diff --git a/src/libutil/fuzzy.h b/src/libutil/fuzzy.h
deleted file mode 100644
index 813599c6b..000000000
--- a/src/libutil/fuzzy.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * @file fuzzy.h
- * Fuzzy hashes API
- */
-
-#ifndef RSPAMD_FUZZY_H
-#define RSPAMD_FUZZY_H
-
-#include "config.h"
-#include "mem_pool.h"
-#include "fstring.h"
-
-#define FUZZY_HASHLEN 64
-
-typedef struct fuzzy_hash_s {
- gchar hash_pipe[FUZZY_HASHLEN]; /**< result hash */
- guint32 block_size; /**< current blocksize */
- guint32 rh; /**< roll hash value */
- guint32 h; /**< hash of block */
- guint32 hi; /**< current index in hash pipe */
-} rspamd_fuzzy_t;
-
-struct mime_text_part;
-
-/**
- * Calculate fuzzy hash for specified string
- * @param in input string
- * @param pool pool object
- * @return fuzzy_hash object allocated in pool
- */
-rspamd_fuzzy_t * rspamd_fuzzy_init (rspamd_fstring_t *in, rspamd_mempool_t *pool);
-/**
- * Calculate fuzzy hash for specified byte array
- * @param in input string
- * @param pool pool object
- * @return fuzzy_hash object allocated in pool
- */
-rspamd_fuzzy_t * rspamd_fuzzy_from_byte_array (GByteArray *in, rspamd_mempool_t *pool);
-
-/**
- * Calculate fuzzy hash for specified text part
- * @param part text part object
- * @param pool pool object
- * @param max_diff maximum text length to use diff algorithm in comparasions
- * @return fuzzy_hash object allocated in pool
- */
-void rspamd_fuzzy_from_text_part (struct mime_text_part *part,
- rspamd_mempool_t *pool,
- gsize max_diff);
-
-/**
- * Compare score of difference between two hashes
- * @param h1 first hash
- * @param h2 second hash
- * @return result in percents 0 - different hashes, 100 - identical hashes
- */
-gint rspamd_fuzzy_compare (rspamd_fuzzy_t *h1, rspamd_fuzzy_t *h2);
-
-/*
- * Compare two text parts and return percents of difference
- */
-gint rspamd_fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2);
-
-/*
- * Calculate levenstein distance between two strings. Note: this algorithm should be used
- * only for short texts - it runs too slow on long ones.
- */
-guint32 rspamd_levinstein_distance (gchar *s1, gint len1, gchar *s2, gint len2);
-
-/*
- * Hash table utilities
- */
-gint rspamd_fuzzy_len (rspamd_fuzzy_t *h);
-guint rspamd_fuzzy_hash (gconstpointer key);
-gboolean rspamd_fuzzy_equal (gconstpointer v1, gconstpointer v2);
-
-#endif