/* * Copyright (c) 2009-2012, Vsevolod Stakhov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "mem_pool.h" #include "fstring.h" #include "fuzzy.h" #include "message.h" #include "url.h" #include "main.h" #include "xxhash.h" #define ROLL_WINDOW_SIZE 9 #define MIN_FUZZY_BLOCK_SIZE 3 #define HASH_INIT 0x28021967 static const char *b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; struct roll_state { guint32 h[3]; gchar window[ROLL_WINDOW_SIZE]; gint n; }; static struct roll_state rs; /* Rolling hash function based on Adler-32 checksum */ static guint32 fuzzy_roll_hash (guint c) { /* Check window position */ if (rs.n == ROLL_WINDOW_SIZE) { rs.n = 0; } rs.h[1] -= rs.h[0]; rs.h[1] += ROLL_WINDOW_SIZE * c; rs.h[0] += c; rs.h[0] -= rs.window[rs.n]; /* Save current symbol */ rs.window[rs.n] = c; rs.n++; rs.h[2] <<= 5; rs.h[2] ^= c; return rs.h[0] + rs.h[1] + rs.h[2]; } /* A simple non-rolling hash, based on the FNV hash */ static guint32 fuzzy_fnv_hash (guint c, guint32 hval) { hval ^= c; hval += (hval << 1) + (hval << 4) + (hval << 7) + (hval << 8) + (hval << 24); return hval; } /* Calculate blocksize depending on length of input */ static guint32 fuzzy_blocksize (guint32 len) { guint32 nlen = MIN_FUZZY_BLOCK_SIZE; while (nlen * (FUZZY_HASHLEN - 1) < len) { nlen *= 2; } return nlen; } /* Update hash with new symbol */ static void fuzzy_update (rspamd_fuzzy_t * h, guint c) { h->rh = fuzzy_roll_hash (c); h->h = fuzzy_fnv_hash (c, h->h); if (h->rh % h->block_size == (h->block_size - 1)) { h->hash_pipe[h->hi] = b64[h->h % 64]; if (h->hi < FUZZY_HASHLEN - 2) { h->h = HASH_INIT; h->hi++; } } } static void fuzzy_update2 (rspamd_fuzzy_t * h1, rspamd_fuzzy_t *h2, guint c) { h1->rh = fuzzy_roll_hash (c); h1->h = fuzzy_fnv_hash (c, h1->h); h2->rh = h1->rh; h2->h = fuzzy_fnv_hash (c, h2->h); if (h1->rh % h1->block_size == (h1->block_size - 1)) { h1->hash_pipe[h1->hi] = b64[h1->h % 64]; if (h1->hi < FUZZY_HASHLEN - 2) { h1->h = HASH_INIT; h1->hi++; } } if (h2->rh % h2->block_size == (h2->block_size - 1)) { h2->hash_pipe[h2->hi] = b64[h2->h % 64]; if (h2->hi < FUZZY_HASHLEN - 2) { h2->h = HASH_INIT; h2->hi++; } } } /* * Levenshtein distance between string1 and string2. * * Replace cost is normally 1, and 2 with nonzero xcost. */ guint32 rspamd_levinstein_distance (gchar *s1, gint len1, gchar *s2, gint len2) { gint i; gint *row; /* we only need to keep one row of costs */ gint *end; gint half, nx; gchar *sx, *char2p, char1; gint *p, D, x, offset, c3; /* strip common prefix */ while (len1 > 0 && len2 > 0 && *s1 == *s2) { len1--; len2--; s1++; s2++; } /* strip common suffix */ while (len1 > 0 && len2 > 0 && s1[len1 - 1] == s2[len2 - 1]) { len1--; len2--; } /* catch trivial cases */ if (len1 == 0) { return len2; } if (len2 == 0) { return len1; } /* make the inner cycle (i.e. string2) the longer one */ if (len1 > len2) { nx = len1; sx = s1; len1 = len2; len2 = nx; s1 = s2; s2 = sx; } /* check len1 == 1 separately */ if (len1 == 1) { return len2 - (memchr (s2, *s1, len2) != NULL); } len1++; len2++; half = len1 >> 1; /* initalize first row */ row = g_malloc (len2 * sizeof (gint)); end = row + len2 - 1; for (i = 0; i < len2; i++) { row[i] = i; } /* in this case we don't have to scan two corner triangles (of size len1/2) * in the matrix because no best path can go throught them. note this * breaks when len1 == len2 == 2 so the memchr() special case above is * necessary */ row[0] = len1 - half - 1; for (i = 1; i < len1; i++) { char1 = s1[i - 1]; /* skip the upper triangle */ if (i >= len1 - half) { offset = i - (len1 - half); char2p = s2 + offset; p = row + offset; c3 = *(p++) + (char1 != *(char2p++)); x = *p; x++; D = x; if (x > c3) x = c3; *(p++) = x; } else { p = row + 1; char2p = s2; D = x = i; } /* skip the lower triangle */ if (i <= half + 1) end = row + len2 + i - half - 2; /* main */ while (p <= end) { c3 = --D + (char1 != *(char2p++)); x++; if (x > c3) x = c3; D = *p; D++; if (x > D) x = D; *(p++) = x; } /* lower triangle sentinel */ if (i <= half) { c3 = --D + (char1 != *char2p); x++; if (x > c3) x = c3; *p = x; } } i = *end; g_free (row); return i; } /* Calculate fuzzy hash for specified string */ rspamd_fuzzy_t * rspamd_fuzzy_init (rspamd_fstring_t * in, rspamd_mempool_t * pool) { rspamd_fuzzy_t *new; guint i, repeats = 0; gchar *c = in->begin, last = '\0'; gsize real_len = 0; new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_fuzzy_t)); bzero (&rs, sizeof (rs)); for (i = 0; i < in->len; i++) { if (*c == last) { repeats++; } else { repeats = 0; } if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) { real_len++; } last = *c; c++; } new->block_size = fuzzy_blocksize (real_len); c = in->begin; for (i = 0; i < in->len; i++) { if (*c == last) { repeats++; } else { repeats = 0; } if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) { fuzzy_update (new, *c); } last = *c; c++; } /* Check whether we have more bytes in a rolling window */ if (new->rh != 0) { new->hash_pipe[new->hi] = b64[new->h % 64]; } return new; } rspamd_fuzzy_t * rspamd_fuzzy_from_byte_array (GByteArray * in, rspamd_mempool_t * pool) { rspamd_fstring_t f; f.begin = (gchar *)in->data; f.len = in->len; return rspamd_fuzzy_init (&f, pool); } void rspamd_fuzzy_from_text_part (struct mime_text_part *part, rspamd_mempool_t *pool, gsize max_diff) { rspamd_fuzzy_t *new, *new2; gchar *c, *end, *begin, *p; gsize real_len = 0, len = part->content->len; GList *cur_offset; struct process_exception *cur_ex = NULL; gunichar uc; gboolean write_diff = FALSE; cur_offset = part->urls_offset; if (cur_offset != NULL) { cur_ex = cur_offset->data; } begin = (gchar *)part->content->data; c = begin; new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_fuzzy_t)); new2 = rspamd_mempool_alloc0 (pool, sizeof (rspamd_fuzzy_t)); bzero (&rs, sizeof (rs)); end = c + len; if (IS_PART_UTF (part)) { while (c < end) { if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) { c += cur_ex->len + 1; cur_offset = g_list_next (cur_offset); if (cur_offset != NULL) { cur_ex = cur_offset->data; } } else { uc = g_utf8_get_char (c); if (g_unichar_isalnum (uc)) { p = g_utf8_next_char (c); real_len += p - c; } else { p = g_utf8_next_char (c); } c = p; } } } else { while (c < end) { if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) { c += cur_ex->len + 1; cur_offset = g_list_next (cur_offset); if (cur_offset != NULL) { cur_ex = cur_offset->data; } } else { if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) { real_len++; } c++; } } } write_diff = real_len > 0 && real_len < max_diff; if (write_diff) { part->diff_str = rspamd_fstralloc (pool, real_len + 1); } else { part->diff_str = NULL; } new->block_size = fuzzy_blocksize (real_len); new2->block_size = new->block_size * 2; cur_offset = part->urls_offset; if (cur_offset != NULL) { cur_ex = cur_offset->data; } begin = (gchar *)part->content->data; c = begin; end = c + len; if (IS_PART_UTF (part)) { while (c < end) { if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) { c += cur_ex->len + 1; cur_offset = g_list_next (cur_offset); if (cur_offset != NULL) { cur_ex = cur_offset->data; } } else { uc = g_utf8_get_char (c); if (g_unichar_isalnum (uc)) { fuzzy_update2 (new, new2, uc); if (write_diff) { rspamd_fstrappend_u (part->diff_str, uc); } } c = g_utf8_next_char (c); } } } else { while (c < end) { if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) { c += cur_ex->len + 1; cur_offset = g_list_next (cur_offset); if (cur_offset != NULL) { cur_ex = cur_offset->data; } } else { if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) { fuzzy_update2 (new, new2, *c); if (write_diff) { rspamd_fstrappend_c (part->diff_str, *c); } } c++; } } } /* Check whether we have more bytes in a rolling window */ if (new->rh != 0) { new->hash_pipe[new->hi] = b64[new->h % 64]; } if (new2->rh != 0) { new2->hash_pipe[new2->hi] = b64[new2->h % 64]; } part->fuzzy = new; part->double_fuzzy = new2; } /* Compare score of difference between two hashes 0 - different hashes, 100 - identical hashes */ gint rspamd_fuzzy_compare (rspamd_fuzzy_t * h1, rspamd_fuzzy_t * h2) { gint res, l1, l2; /* If we have hashes of different size, input strings are too different */ if (h1->block_size != h2->block_size) { return 0; } l1 = strlen (h1->hash_pipe); l2 = strlen (h2->hash_pipe); if (l1 == 0 || l2 == 0) { if (l1 == 0 && l2 == 0) { return 100; } else { return 0; } } res = rspamd_levinstein_distance (h1->hash_pipe, l1, h2->hash_pipe, l2); res = 100 - (2 * res * 100) / (l1 + l2); return res; } gint rspamd_fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2) { if (p1->fuzzy != NULL && p2->fuzzy != NULL) { if (p1->fuzzy->block_size == p2->fuzzy->block_size) { return rspamd_fuzzy_compare (p1->fuzzy, p2->fuzzy); } else if (p1->double_fuzzy->block_size == p2->fuzzy->block_size) { return rspamd_fuzzy_compare (p1->double_fuzzy, p2->fuzzy); } else if (p2->double_fuzzy->block_size == p1->fuzzy->block_size) { return rspamd_fuzzy_compare (p2->double_fuzzy, p1->fuzzy); } } return 0; } gint rspamd_fuzzy_len (rspamd_fuzzy_t *h) { gint len; void *nullpos; nullpos = memchr (h->hash_pipe, '\0', sizeof (h->hash_pipe)); if (nullpos == NULL) { len = sizeof (h->hash_pipe); } else { len = (char *)nullpos - h->hash_pipe; } return len; } guint rspamd_fuzzy_hash (gconstpointer key) { rspamd_fuzzy_t *fh = (rspamd_fuzzy_t *)key; XXH64_state_t xxh; XXH64_reset (&xxh, rspamd_hash_seed ()); XXH64_update (&xxh, &fh->block_size, sizeof (fh->block_size)); XXH64_update (&xxh, fh->hash_pipe, rspamd_fuzzy_len (fh)); return XXH64_digest (&xxh); } gboolean rspamd_fuzzy_equal (gconstpointer v1, gconstpointer v2) { rspamd_fuzzy_t *fh1= (rspamd_fuzzy_t *)v1, *fh2 = (rspamd_fuzzy_t *)v2; if (fh1->block_size == fh2->block_size) { gint l1 = rspamd_fuzzy_len (fh1), l2 = rspamd_fuzzy_len (fh2); if (l1 == l2) { return (memcmp (fh1->hash_pipe, fh2->hash_pipe, l1) == 0); } } return FALSE; } /* * vi:ts=4 */