/*-
 * Copyright 2016 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "shingles.h"
#include "fstring.h"
#include "cryptobox.h"

#define SHINGLES_WINDOW 3

struct rspamd_shingle*
rspamd_shingles_generate (GArray *input,
		const guchar key[16],
		rspamd_mempool_t *pool,
		rspamd_shingles_filter filter,
		gpointer filterd)
{
	struct rspamd_shingle *res;
	GArray *hashes[RSPAMD_SHINGLE_SIZE];
	rspamd_sipkey_t keys[RSPAMD_SHINGLE_SIZE];
	guchar shabuf[rspamd_cryptobox_HASHBYTES], *out_key;
	const guchar *cur_key;
	rspamd_fstring_t *row;
	rspamd_ftok_t *word;
	rspamd_cryptobox_hash_state_t bs;
	guint64 val;
	gint i, j, beg = 0;

	if (pool != NULL) {
		res = rspamd_mempool_alloc (pool, sizeof (*res));
	}
	else {
		res = g_malloc (sizeof (*res));
	}

	rspamd_cryptobox_hash_init (&bs, NULL, 0);
	row = rspamd_fstring_sized_new (256);
	cur_key = key;
	out_key = (guchar *)&keys[0];

	/* Init hashes pipes and keys */
	for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
		hashes[i] = g_array_sized_new (FALSE, FALSE, sizeof (guint64),
				input->len + SHINGLES_WINDOW);
		/*
		 * To generate a set of hashes we just apply sha256 to the
		 * initial key as many times as many hashes are required and
		 * xor left and right parts of sha256 to get a single 16 bytes SIP key.
		 */
		rspamd_cryptobox_hash_update (&bs, cur_key, 16);
		rspamd_cryptobox_hash_final (&bs, shabuf);

		for (j = 0; j < 16; j ++) {
			out_key[j] = shabuf[j];
		}

		rspamd_cryptobox_hash_init (&bs, NULL, 0);
		cur_key = out_key;
		out_key += 16;
	}

	/* Now parse input words into a vector of hashes using rolling window */
	for (i = 0; i <= (gint)input->len; i ++) {
		if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {
			for (j = beg; j < i; j ++) {
				word = &g_array_index (input, rspamd_ftok_t, j);
				row = rspamd_fstring_append (row, word->begin, word->len);
			}
			beg++;

			/* Now we need to create a new row here */
			for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
				rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len,
						keys[j]);
				g_array_append_val (hashes[j], val);
			}

			row = rspamd_fstring_assign (row, "", 0);
		}
	}

	/* Now we need to filter all hashes and make a shingles result */
	for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
		res->hashes[i] = filter ((guint64 *)hashes[i]->data, hashes[i]->len,
				i, key, filterd);
		g_array_free (hashes[i], TRUE);
	}

	rspamd_fstring_free (row);

	return res;
}


guint64
rspamd_shingles_default_filter (guint64 *input, gsize count,
		gint shno, const guchar *key, gpointer ud)
{
	guint64 minimal = G_MAXUINT64;
	gsize i;

	for (i = 0; i < count; i ++) {
		if (minimal > input[i]) {
			minimal = input[i];
		}
	}

	return minimal;
}


gdouble rspamd_shingles_compare (const struct rspamd_shingle *a,
		const struct rspamd_shingle *b)
{
	gint i, common = 0;

	for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
		if (a->hashes[i] == b->hashes[i]) {
			common ++;
		}
	}

	return (gdouble)common / (gdouble)RSPAMD_SHINGLE_SIZE;
}