diff options
Diffstat (limited to 'src/fuzzy.c')
-rw-r--r-- | src/fuzzy.c | 273 |
1 files changed, 273 insertions, 0 deletions
diff --git a/src/fuzzy.c b/src/fuzzy.c new file mode 100644 index 000000000..08814eaa1 --- /dev/null +++ b/src/fuzzy.c @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2009, Rambler media + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include "config.h" +#include "mem_pool.h" +#include "fstring.h" +#include "fuzzy.h" + +#define ROLL_WINDOW_SIZE 9 +#define MIN_FUZZY_BLOCK_SIZE 3 +#define HASH_INIT 0x28021967 + +struct roll_state { + uint32_t h[3]; + char window[ROLL_WINDOW_SIZE]; + int n; +}; + +static struct roll_state rs; + + +/* Rolling hash function based on Adler-32 checksum */ +static uint32_t +fuzzy_roll_hash (char c) +{ + /* Check window position */ + if (rs.n == ROLL_WINDOW_SIZE) { + rs.n = 0; + } + + rs.h[1] -= rs.h[0]; + rs.h[1] += ROLL_WINDOW_SIZE * c; + + rs.h[0] += c; + rs.h[0] -= rs.window[rs.n]; + + /* Save current symbol */ + rs.window[rs.n] = c; + rs.n ++; + + rs.h[2] <<= 5; + rs.h[2] ^= c; + + return rs.h[1] + rs.h[2] + rs.h[3]; +} + +/* A simple non-rolling hash, based on the FNV hash */ +static uint32_t +fuzzy_fnv_hash (char c, uint32_t hval) +{ + hval ^= c; + hval = hval += (hval<<1) + (hval<<4) + (hval<<7) + (hval<<8) + (hval<<24); + return hval; +} + +/* Calculate blocksize depending on length of input */ +static uint32_t +fuzzy_blocksize (uint32_t len) +{ + + if (len < MIN_FUZZY_BLOCK_SIZE) { + return MIN_FUZZY_BLOCK_SIZE; + } + return g_spaced_primes_closest (len / FUZZY_HASHLEN); +} + +/* Update hash with new symbol */ +void +fuzzy_update (fuzzy_hash_t *h, char c) +{ + h->rh = fuzzy_roll_hash (c); + h->h = fuzzy_fnv_hash(c, h->h); + + if (h->rh % h->block_size == (h->block_size - 1)) { + h->hash_pipe[h->hi] = h->h; + if (h->hi < FUZZY_HASHLEN - 1) { + h->h = HASH_INIT; + h->hi ++; + } + } +} + +/* + * Levenshtein distance between string1 and string2. + * + * Replace cost is normally 1, and 2 with nonzero xcost. + */ +static uint32_t +lev_distance (char *s1, int len1, char *s2, int len2) +{ + int i; + int *row; /* we only need to keep one row of costs */ + int *end; + int half, nx; + char *sx, *char2p, char1; + int *p, D, x, offset, c3; + + /* strip common prefix */ + while (len1 > 0 && len2 > 0 && *s1 == *s2) { + len1 --; + len2 --; + s1 ++; + s2 ++; + } + + /* strip common suffix */ + while (len1 > 0 && len2 > 0 && s1[len1-1] == s2[len2-1]) { + len1--; + len2--; + } + + /* catch trivial cases */ + if (len1 == 0) { + return len2; + } + + if (len2 == 0) { + return len1; + } + + /* make the inner cycle (i.e. string2) the longer one */ + if (len1 > len2) { + nx = len1; + sx = s1; + len1 = len2; + len2 = nx; + s1 = s2; + s2 = sx; + } + /* check len1 == 1 separately */ + if (len1 == 1) { + return len2 - (memchr (s2, *s1, len2) != NULL); + } + + len1 ++; + len2 ++; + half = len1 >> 1; + + /* initalize first row */ + row = g_malloc (len2 * sizeof(int)); + end = row + len2 - 1; + for (i = 0; i < len2; i++) { + row[i] = i; + } + + /* in this case we don't have to scan two corner triangles (of size len1/2) + * in the matrix because no best path can go throught them. note this + * breaks when len1 == len2 == 2 so the memchr() special case above is + * necessary */ + row[0] = len1 - half - 1; + for (i = 1; i < len1; i++) { + char1 = s1[i - 1]; + /* skip the upper triangle */ + if (i >= len1 - half) { + offset = i - (len1 - half); + char2p = s2 + offset; + p = row + offset; + c3 = *(p++) + (char1 != *(char2p++)); + x = *p; + x++; + D = x; + if (x > c3) + x = c3; + *(p++) = x; + } + else { + p = row + 1; + char2p = s2; + D = x = i; + } + /* skip the lower triangle */ + if (i <= half + 1) + end = row + len2 + i - half - 2; + /* main */ + while (p <= end) { + c3 = --D + (char1 != *(char2p++)); + x++; + if (x > c3) + x = c3; + D = *p; + D++; + if (x > D) + x = D; + *(p++) = x; + } + /* lower triangle sentinel */ + if (i <= half) { + c3 = --D + (char1 != *char2p); + x++; + if (x > c3) + x = c3; + *p = x; + } + } + + i = *end; + g_free (row); + return i; +} + +/* Calculate fuzzy hash for specified string */ +fuzzy_hash_t * +fuzzy_init (f_str_t *in, memory_pool_t *pool) +{ + fuzzy_hash_t *new; + int i, repeats = 0; + char *c = in->begin, last = '\0'; + + new = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t)); + bzero (&rs, sizeof (rs)); + new->block_size = fuzzy_blocksize (in->len); + + for (i = 0; i < in->len; i ++) { + if (*c == last) { + repeats ++; + } + else { + repeats = 0; + } + if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) { + fuzzy_update (new, *c); + } + last = *c; + c ++; + } + + return new; +} + +/* Compare score of difference between two hashes 0 - different hashes, 100 - identical hashes */ +int +fuzzy_compare_hashes (fuzzy_hash_t *h1, fuzzy_hash_t *h2) +{ + int res, l1, l2; + + /* If we have hashes of different size, input strings are too different */ + if (h1->block_size != h2->block_size) { + return 100; + } + + l1 = strlen (h1->hash_pipe); + l2 = strlen (h2->hash_pipe); + res = lev_distance (h1->hash_pipe, l1, h2->hash_pipe, l2); + res = (res * 100) / (l1 + l2); + + return res; +} + +/* + * vi:ts=4 + */ |