/*
 * Copyright (c) 2009, Rambler media
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


#include "config.h"
#include "mem_pool.h"
#include "fstring.h"
#include "fuzzy.h"
#include "message.h"
#include "url.h"
#include "main.h"

#define ROLL_WINDOW_SIZE 9
#define MIN_FUZZY_BLOCK_SIZE 3
#define HASH_INIT      0x28021967

static const char *b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

struct roll_state {
	guint32                         h[3];
	gchar                           window[ROLL_WINDOW_SIZE];
	gint                            n;
};

static struct roll_state        rs;


/* Rolling hash function based on Adler-32 checksum */
static                          guint32
fuzzy_roll_hash (guint c)
{
	/* Check window position */
	if (rs.n == ROLL_WINDOW_SIZE) {
		rs.n = 0;
	}

	rs.h[1] -= rs.h[0];
	rs.h[1] += ROLL_WINDOW_SIZE * c;

	rs.h[0] += c;
	rs.h[0] -= rs.window[rs.n];

	/* Save current symbol */
	rs.window[rs.n] = c;
	rs.n++;

	rs.h[2] <<= 5;
	rs.h[2] ^= c;

	return rs.h[0] + rs.h[1] + rs.h[2];
}

/* A simple non-rolling hash, based on the FNV hash */
static                          guint32
fuzzy_fnv_hash (guint c, guint32 hval)
{
	hval ^= c;
	hval += (hval << 1) + (hval << 4) + (hval << 7) + (hval << 8) + (hval << 24);
	return hval;
}

/* Calculate blocksize depending on length of input */
static                          guint32
fuzzy_blocksize (guint32 len)
{
	guint32                     nlen = MIN_FUZZY_BLOCK_SIZE;

	while (nlen * (FUZZY_HASHLEN - 1) < len) {
		nlen *= 2;
	}
	return nlen;
}


/* Update hash with new symbol */
static void
fuzzy_update (fuzzy_hash_t * h, guint c)
{
	h->rh = fuzzy_roll_hash (c);
	h->h = fuzzy_fnv_hash (c, h->h);

	if (h->rh % h->block_size == (h->block_size - 1)) {
		h->hash_pipe[h->hi] = b64[h->h % 64];
		if (h->hi < FUZZY_HASHLEN - 2) {
			h->h = HASH_INIT;
			h->hi++;
		}
	}
}

static void
fuzzy_update2 (fuzzy_hash_t * h1, fuzzy_hash_t *h2, guint c)
{
	h1->rh = fuzzy_roll_hash (c);
	h1->h = fuzzy_fnv_hash (c, h1->h);
	h2->rh = h1->rh;
	h2->h = fuzzy_fnv_hash (c, h2->h);

	if (h1->rh % h1->block_size == (h1->block_size - 1)) {
		h1->hash_pipe[h1->hi] = b64[h1->h % 64];
		if (h1->hi < FUZZY_HASHLEN - 2) {
			h1->h = HASH_INIT;
			h1->hi++;
		}
	}
	if (h2->rh % h2->block_size == (h2->block_size - 1)) {
		h2->hash_pipe[h2->hi] = b64[h2->h % 64];
		if (h2->hi < FUZZY_HASHLEN - 2) {
			h2->h = HASH_INIT;
			h2->hi++;
		}
	}
}

/*
 * Levenshtein distance between string1 and string2.
 *
 * Replace cost is normally 1, and 2 with nonzero xcost.
 */
guint32
lev_distance (gchar *s1, gint len1, gchar *s2, gint len2)
{
	gint                            i;
	gint                            *row;	/* we only need to keep one row of costs */
	gint                            *end;
	gint                            half, nx;
	gchar                           *sx, *char2p, char1;
	gint                            *p, D, x, offset, c3;

	/* strip common prefix */
	while (len1 > 0 && len2 > 0 && *s1 == *s2) {
		len1--;
		len2--;
		s1++;
		s2++;
	}

	/* strip common suffix */
	while (len1 > 0 && len2 > 0 && s1[len1 - 1] == s2[len2 - 1]) {
		len1--;
		len2--;
	}

	/* catch trivial cases */
	if (len1 == 0) {
		return len2;
	}

	if (len2 == 0) {
		return len1;
	}

	/* make the inner cycle (i.e. string2) the longer one */
	if (len1 > len2) {
		nx = len1;
		sx = s1;
		len1 = len2;
		len2 = nx;
		s1 = s2;
		s2 = sx;
	}
	/* check len1 == 1 separately */
	if (len1 == 1) {
		return len2 - (memchr (s2, *s1, len2) != NULL);
	}

	len1++;
	len2++;
	half = len1 >> 1;

	/* initalize first row */
	row = g_malloc (len2 * sizeof (gint));
	end = row + len2 - 1;
	for (i = 0; i < len2; i++) {
		row[i] = i;
	}

	/* in this case we don't have to scan two corner triangles (of size len1/2)
	 * in the matrix because no best path can go throught them. note this
	 * breaks when len1 == len2 == 2 so the memchr() special case above is
	 * necessary */
	row[0] = len1 - half - 1;
	for (i = 1; i < len1; i++) {
		char1 = s1[i - 1];
		/* skip the upper triangle */
		if (i >= len1 - half) {
			offset = i - (len1 - half);
			char2p = s2 + offset;
			p = row + offset;
			c3 = *(p++) + (char1 != *(char2p++));
			x = *p;
			x++;
			D = x;
			if (x > c3)
				x = c3;
			*(p++) = x;
		}
		else {
			p = row + 1;
			char2p = s2;
			D = x = i;
		}
		/* skip the lower triangle */
		if (i <= half + 1)
			end = row + len2 + i - half - 2;
		/* main */
		while (p <= end) {
			c3 = --D + (char1 != *(char2p++));
			x++;
			if (x > c3)
				x = c3;
			D = *p;
			D++;
			if (x > D)
				x = D;
			*(p++) = x;
		}
		/* lower triangle sentinel */
		if (i <= half) {
			c3 = --D + (char1 != *char2p);
			x++;
			if (x > c3)
				x = c3;
			*p = x;
		}
	}

	i = *end;
	g_free (row);
	return i;
}

/* Calculate fuzzy hash for specified string */
fuzzy_hash_t                   *
fuzzy_init (f_str_t * in, memory_pool_t * pool)
{
	fuzzy_hash_t                   *new;
	guint                           i, repeats = 0;
	gchar                          *c = in->begin, last = '\0';
	gsize                           real_len = 0;

	new = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t));
	bzero (&rs, sizeof (rs));
	for (i = 0; i < in->len; i++) {
		if (*c == last) {
			repeats++;
		}
		else {
			repeats = 0;
		}
		if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) {
			real_len ++;
		}
		last = *c;
		c++;
	}

	new->block_size = fuzzy_blocksize (real_len);
	c = in->begin;

	for (i = 0; i < in->len; i++) {
		if (*c == last) {
			repeats++;
		}
		else {
			repeats = 0;
		}
		if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) {
			fuzzy_update (new, *c);
		}
		last = *c;
		c++;
	}

	/* Check whether we have more bytes in a rolling window */
	if (new->rh != 0) {
		new->hash_pipe[new->hi] = b64[new->h % 64];
	}

	return new;
}

fuzzy_hash_t                   *
fuzzy_init_byte_array (GByteArray * in, memory_pool_t * pool)
{
	f_str_t                         f;

	f.begin = (gchar *)in->data;
	f.len = in->len;

	return fuzzy_init (&f, pool);
}

void
fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool, gsize max_diff)
{
	fuzzy_hash_t                   *new, *new2;
	gchar                          *c, *end, *begin;
	gsize                           real_len = 0, len = part->content->len;
	GList                          *cur_offset;
	struct process_exception       *cur_ex = NULL;
	gunichar                        uc;
	gboolean                        write_diff = FALSE;

	cur_offset = part->urls_offset;
	if (cur_offset != NULL) {
		cur_ex = cur_offset->data;
	}

	begin = (gchar *)part->content->data;
	c = begin;
	new = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t));
	new2 = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t));
	bzero (&rs, sizeof (rs));
	end = c + len;

	if (part->is_utf) {
		while (c < end) {
			if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) {
				c += cur_ex->len + 1;
				cur_offset = g_list_next (cur_offset);
				if (cur_offset != NULL) {
					cur_ex = cur_offset->data;
				}
			}
			else {
				uc = g_utf8_get_char (c);
				if (g_unichar_isalnum (uc)) {
					real_len ++;
				}
				c = g_utf8_next_char (c);
			}
		}
	}
	else {
		while (c < end) {
			if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) {
				c += cur_ex->len + 1;
				cur_offset = g_list_next (cur_offset);
				if (cur_offset != NULL) {
					cur_ex = cur_offset->data;
				}
			}
			else {
				if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) {
					real_len ++;
				}
				c++;
			}
		}
	}

	write_diff = real_len > 0 && real_len < max_diff;

	if (write_diff) {
		part->diff_str = fstralloc (pool, real_len);
	}
	else {
		part->diff_str = NULL;
	}

	new->block_size = fuzzy_blocksize (real_len);
	new2->block_size = new->block_size * 2;

	cur_offset = part->urls_offset;
	if (cur_offset != NULL) {
		cur_ex = cur_offset->data;
	}

	begin = (gchar *)part->content->data;
	c = begin;
	end = c + len;
	if (part->is_utf) {

		while (c < end) {
			if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) {
				c += cur_ex->len + 1;
				cur_offset = g_list_next (cur_offset);
				if (cur_offset != NULL) {
					cur_ex = cur_offset->data;
				}
			}
			else {
				uc = g_utf8_get_char (c);
				if (g_unichar_isalnum (uc)) {
					fuzzy_update2 (new, new2, uc);
					if (write_diff) {
						fstrpush_unichar (part->diff_str, uc);
					}
				}
				c = g_utf8_next_char (c);
			}
		}
	}
	else {
		while (c < end) {
			if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) {
				c += cur_ex->len + 1;
				cur_offset = g_list_next (cur_offset);
				if (cur_offset != NULL) {
					cur_ex = cur_offset->data;
				}
			}
			else {
				if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) {
					fuzzy_update2 (new, new2, *c);
					if (write_diff) {
						fstrpush (part->diff_str, *c);
					}
				}
				c++;
			}
		}
	}

	/* Check whether we have more bytes in a rolling window */
	if (new->rh != 0) {
		new->hash_pipe[new->hi] = b64[new->h % 64];
	}
	if (new2->rh != 0) {
		new2->hash_pipe[new2->hi] = b64[new2->h % 64];
	}

	part->fuzzy = new;
	part->double_fuzzy = new2;
}

/* Compare score of difference between two hashes 0 - different hashes, 100 - identical hashes */
gint
fuzzy_compare_hashes (fuzzy_hash_t * h1, fuzzy_hash_t * h2)
{
	gint                            res, l1, l2;

	/* If we have hashes of different size, input strings are too different */
	if (h1->block_size != h2->block_size) {
		return 0;
	}

	l1 = strlen (h1->hash_pipe);
	l2 = strlen (h2->hash_pipe);

	if (l1 == 0 || l2 == 0) {
		if (l1 == 0 && l2 == 0) {
			return 100;
		}
		else {
			return 0;
		}
	}

	res = lev_distance (h1->hash_pipe, l1, h2->hash_pipe, l2);
	res = 100 - (2 * res * 100) / (l1 + l2);

	return res;
}

gint
fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2)
{
	if (p1->fuzzy != NULL && p2->fuzzy != NULL) {
		if (p1->fuzzy->block_size == p2->fuzzy->block_size) {
			return fuzzy_compare_hashes (p1->fuzzy, p2->fuzzy);
		}
		else if (p1->double_fuzzy->block_size == p2->fuzzy->block_size) {
			return fuzzy_compare_hashes (p1->double_fuzzy, p2->fuzzy);
		}
		else if (p2->double_fuzzy->block_size == p1->fuzzy->block_size) {
			return fuzzy_compare_hashes (p2->double_fuzzy, p1->fuzzy);
		}
	}

	return 0;
}

/* 
 * vi:ts=4 
 */