aboutsummaryrefslogtreecommitdiffstats
path: root/src/fuzzy.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/fuzzy.c')
-rw-r--r--src/fuzzy.c273
1 files changed, 273 insertions, 0 deletions
diff --git a/src/fuzzy.c b/src/fuzzy.c
new file mode 100644
index 000000000..08814eaa1
--- /dev/null
+++ b/src/fuzzy.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2009, Rambler media
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include "config.h"
+#include "mem_pool.h"
+#include "fstring.h"
+#include "fuzzy.h"
+
+#define ROLL_WINDOW_SIZE 9
+#define MIN_FUZZY_BLOCK_SIZE 3
+#define HASH_INIT 0x28021967
+
+struct roll_state {
+ uint32_t h[3];
+ char window[ROLL_WINDOW_SIZE];
+ int n;
+};
+
+static struct roll_state rs;
+
+
+/* Rolling hash function based on Adler-32 checksum */
+static uint32_t
+fuzzy_roll_hash (char c)
+{
+ /* Check window position */
+ if (rs.n == ROLL_WINDOW_SIZE) {
+ rs.n = 0;
+ }
+
+ rs.h[1] -= rs.h[0];
+ rs.h[1] += ROLL_WINDOW_SIZE * c;
+
+ rs.h[0] += c;
+ rs.h[0] -= rs.window[rs.n];
+
+ /* Save current symbol */
+ rs.window[rs.n] = c;
+ rs.n ++;
+
+ rs.h[2] <<= 5;
+ rs.h[2] ^= c;
+
+ return rs.h[1] + rs.h[2] + rs.h[3];
+}
+
+/* A simple non-rolling hash, based on the FNV hash */
+static uint32_t
+fuzzy_fnv_hash (char c, uint32_t hval)
+{
+ hval ^= c;
+ hval = hval += (hval<<1) + (hval<<4) + (hval<<7) + (hval<<8) + (hval<<24);
+ return hval;
+}
+
+/* Calculate blocksize depending on length of input */
+static uint32_t
+fuzzy_blocksize (uint32_t len)
+{
+
+ if (len < MIN_FUZZY_BLOCK_SIZE) {
+ return MIN_FUZZY_BLOCK_SIZE;
+ }
+ return g_spaced_primes_closest (len / FUZZY_HASHLEN);
+}
+
+/* Update hash with new symbol */
+void
+fuzzy_update (fuzzy_hash_t *h, char c)
+{
+ h->rh = fuzzy_roll_hash (c);
+ h->h = fuzzy_fnv_hash(c, h->h);
+
+ if (h->rh % h->block_size == (h->block_size - 1)) {
+ h->hash_pipe[h->hi] = h->h;
+ if (h->hi < FUZZY_HASHLEN - 1) {
+ h->h = HASH_INIT;
+ h->hi ++;
+ }
+ }
+}
+
+/*
+ * Levenshtein distance between string1 and string2.
+ *
+ * Replace cost is normally 1, and 2 with nonzero xcost.
+ */
+static uint32_t
+lev_distance (char *s1, int len1, char *s2, int len2)
+{
+ int i;
+ int *row; /* we only need to keep one row of costs */
+ int *end;
+ int half, nx;
+ char *sx, *char2p, char1;
+ int *p, D, x, offset, c3;
+
+ /* strip common prefix */
+ while (len1 > 0 && len2 > 0 && *s1 == *s2) {
+ len1 --;
+ len2 --;
+ s1 ++;
+ s2 ++;
+ }
+
+ /* strip common suffix */
+ while (len1 > 0 && len2 > 0 && s1[len1-1] == s2[len2-1]) {
+ len1--;
+ len2--;
+ }
+
+ /* catch trivial cases */
+ if (len1 == 0) {
+ return len2;
+ }
+
+ if (len2 == 0) {
+ return len1;
+ }
+
+ /* make the inner cycle (i.e. string2) the longer one */
+ if (len1 > len2) {
+ nx = len1;
+ sx = s1;
+ len1 = len2;
+ len2 = nx;
+ s1 = s2;
+ s2 = sx;
+ }
+ /* check len1 == 1 separately */
+ if (len1 == 1) {
+ return len2 - (memchr (s2, *s1, len2) != NULL);
+ }
+
+ len1 ++;
+ len2 ++;
+ half = len1 >> 1;
+
+ /* initalize first row */
+ row = g_malloc (len2 * sizeof(int));
+ end = row + len2 - 1;
+ for (i = 0; i < len2; i++) {
+ row[i] = i;
+ }
+
+ /* in this case we don't have to scan two corner triangles (of size len1/2)
+ * in the matrix because no best path can go throught them. note this
+ * breaks when len1 == len2 == 2 so the memchr() special case above is
+ * necessary */
+ row[0] = len1 - half - 1;
+ for (i = 1; i < len1; i++) {
+ char1 = s1[i - 1];
+ /* skip the upper triangle */
+ if (i >= len1 - half) {
+ offset = i - (len1 - half);
+ char2p = s2 + offset;
+ p = row + offset;
+ c3 = *(p++) + (char1 != *(char2p++));
+ x = *p;
+ x++;
+ D = x;
+ if (x > c3)
+ x = c3;
+ *(p++) = x;
+ }
+ else {
+ p = row + 1;
+ char2p = s2;
+ D = x = i;
+ }
+ /* skip the lower triangle */
+ if (i <= half + 1)
+ end = row + len2 + i - half - 2;
+ /* main */
+ while (p <= end) {
+ c3 = --D + (char1 != *(char2p++));
+ x++;
+ if (x > c3)
+ x = c3;
+ D = *p;
+ D++;
+ if (x > D)
+ x = D;
+ *(p++) = x;
+ }
+ /* lower triangle sentinel */
+ if (i <= half) {
+ c3 = --D + (char1 != *char2p);
+ x++;
+ if (x > c3)
+ x = c3;
+ *p = x;
+ }
+ }
+
+ i = *end;
+ g_free (row);
+ return i;
+}
+
+/* Calculate fuzzy hash for specified string */
+fuzzy_hash_t *
+fuzzy_init (f_str_t *in, memory_pool_t *pool)
+{
+ fuzzy_hash_t *new;
+ int i, repeats = 0;
+ char *c = in->begin, last = '\0';
+
+ new = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t));
+ bzero (&rs, sizeof (rs));
+ new->block_size = fuzzy_blocksize (in->len);
+
+ for (i = 0; i < in->len; i ++) {
+ if (*c == last) {
+ repeats ++;
+ }
+ else {
+ repeats = 0;
+ }
+ if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) {
+ fuzzy_update (new, *c);
+ }
+ last = *c;
+ c ++;
+ }
+
+ return new;
+}
+
+/* Compare score of difference between two hashes 0 - different hashes, 100 - identical hashes */
+int
+fuzzy_compare_hashes (fuzzy_hash_t *h1, fuzzy_hash_t *h2)
+{
+ int res, l1, l2;
+
+ /* If we have hashes of different size, input strings are too different */
+ if (h1->block_size != h2->block_size) {
+ return 100;
+ }
+
+ l1 = strlen (h1->hash_pipe);
+ l2 = strlen (h2->hash_pipe);
+ res = lev_distance (h1->hash_pipe, l1, h2->hash_pipe, l2);
+ res = (res * 100) / (l1 + l2);
+
+ return res;
+}
+
+/*
+ * vi:ts=4
+ */