/* Mirror of https://github.com/vstakhov/rspamd.git */
							/*-
 * Copyright 2016 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * Common tokenization functions
 */

#include "rspamd.h"
#include "tokenizers.h"
#include "stat_internal.h"
#include "xxhash.h"

/*
 * Signature shared by the word extractors in this file
 * (rspamd_tokenizer_get_word_compat and rspamd_tokenizer_get_word), so that
 * rspamd_tokenize_text can pick one of them at runtime.
 *
 *   buf             - text being tokenized
 *   pos             - in/out cursor into buf; when *pos is NULL the scan
 *                     starts at the beginning of buf
 *   token           - out: the extracted word
 *   exceptions      - in/out: list whose elements are
 *                     struct process_exception; the list head is advanced
 *                     as exceptions are consumed
 *   is_utf          - whether buf is to be treated as UTF-8
 *   rl              - out, may be NULL: length of the extracted word
 *   check_signature - enables signature detection (the compat variant
 *                     ignores it)
 *
 * Returns TRUE while the caller should keep iterating and FALSE when
 * tokenization is finished.
 */
typedef gboolean (*token_get_function) (rspamd_ftok_t * buf, gchar const **pos,
		rspamd_ftok_t * token,
		GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature);

/*
 * Delimiter lookup table used by the byte-oriented (compat) tokenizer: a
 * non-zero entry means that the byte value used as index separates words
 * (tab, LF, CR, space and most ASCII punctuation).
 *
 * The table is indexed with `(guchar)*p`, i.e. with any value in 0..255, so
 * it must hold 256 entries. It used to hold only 255, which made a 0xFF byte
 * in the input read one element past the end of the array. The extra slot is
 * zero: 0xFF is not a delimiter, like every other non-ASCII byte.
 */
const gchar t_delimiters[256] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
	1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
	1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
	1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0
};

/*
 * Ordering callback for rspamd_token_t nodes.
 *
 * Tokens are ordered by data length first; tokens of equal length are
 * ordered by the byte-wise comparison of their data. The result is zero
 * only for tokens with identical length and content.
 */
gint
token_node_compare_func (gconstpointer a, gconstpointer b)
{
	const rspamd_token_t *lhs = a;
	const rspamd_token_t *rhs = b;

	if (lhs->datalen == rhs->datalen) {
		/* Same size: the content decides */
		return memcmp (lhs->data, rhs->data, lhs->datalen);
	}

	return lhs->datalen - rhs->datalen;
}

/*
 * Legacy byte-oriented word extractor: returns the next word from the
 * rspamd_ftok_t `buf`, using t_delimiters to tell word bytes from separators.
 *
 *   buf        - text to scan; NULL makes the function return FALSE
 *   cur        - in/out cursor (must not be NULL); *cur == NULL or
 *                token->begin == NULL means "first call for this buffer"
 *   token      - out: begin/len of the word found (points into buf)
 *   exceptions - in/out list of struct process_exception; may be NULL
 *   is_utf     - when TRUE, *rl is the number of UTF-8 characters in the
 *                word, otherwise its length in bytes
 *   rl         - out, may be NULL: word length as described above
 *   unused     - ignored, present to match token_get_function
 *
 * Returns TRUE when the caller should call again, FALSE when the cursor is
 * at or past the end of buf or the scan ran into the end of buf.
 *
 * Behaviour worth knowing before touching this code:
 *  - the first loop is a do/while, so one byte at *cur is always consumed
 *    before the delimiter table is consulted;
 *  - a word that extends to the very end of buf ends the scan with FALSE
 *    and is therefore not reported;
 *  - both "exception reached" exits return TRUE without writing *rl, so the
 *    caller sees whatever value its length variable held before the call.
 */
static gboolean
rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf,
		gchar const **cur, rspamd_ftok_t * token,
		GList **exceptions, gboolean is_utf, gsize *rl, gboolean unused)
{
	gsize remain, pos;
	const gchar *p;
	struct process_exception *ex = NULL;

	if (buf == NULL) {
		return FALSE;
	}

	g_assert (cur != NULL);

	/* Only the exception at the head of the list is considered */
	if (exceptions != NULL && *exceptions != NULL) {
		ex = (*exceptions)->data;
	}

	/* First call for this buffer: position the cursor */
	if (token->begin == NULL || *cur == NULL) {
		if (ex != NULL) {
			if (ex->pos == 0) {
				/*
				 * Exception at offset 0: start right behind it. Note that
				 * the exception list itself is not advanced here.
				 */
				token->begin = buf->begin + ex->len;
				token->len = ex->len;
			}
			else {
				token->begin = buf->begin;
				token->len = 0;
			}
		}
		else {
			token->begin = buf->begin;
			token->len = 0;
		}
		*cur = token->begin;
	}

	/* Whatever was stored above, the word length is counted from zero */
	token->len = 0;

	pos = *cur - buf->begin;
	if (pos >= buf->len) {
		return FALSE;
	}

	remain = buf->len - pos;
	p = *cur;

	/*
	 * Skip delimiters. The byte at the cursor is consumed unconditionally,
	 * the table is checked only for the bytes that follow it.
	 */
	do {
		if (ex != NULL && ex->pos == pos) {
			/* Go to the next exception; *rl is left untouched on this path */
			*exceptions = g_list_next (*exceptions);
			*cur = p + ex->len;
			return TRUE;
		}
		pos++;
		p++;
		remain--;
	} while (remain > 0 && t_delimiters[(guchar)*p]);

	token->begin = p;

	/* Collect word bytes until a delimiter, an exception or the end of buf */
	while (remain > 0 && !t_delimiters[(guchar)*p]) {
		if (ex != NULL && ex->pos == pos) {
			/*
			 * The word is cut short by an exception: what has been
			 * collected so far is returned, again without writing *rl.
			 */
			*exceptions = g_list_next (*exceptions);
			*cur = p + ex->len;
			return TRUE;
		}
		token->len++;
		pos++;
		remain--;
		p++;
	}

	/* End of buffer reached: stop, the trailing word is not reported */
	if (remain == 0) {
		return FALSE;
	}

	if (rl) {
		if (is_utf) {
			*rl = g_utf8_strlen (token->begin, token->len);
		}
		else {
			*rl = token->len;
		}
	}

	/* Leave the cursor on the delimiter that terminated the word */
	*cur = p;

	return TRUE;
}

/*
 * UTF-8 aware word extractor: returns the next word from `buf`, where a word
 * is a run of characters that are graphical (g_unichar_isgraph) and not
 * punctuation (g_unichar_ispunct).
 *
 *   buf             - text to scan; NULL makes the function return FALSE
 *   cur             - in/out cursor (must not be NULL); *cur == NULL means
 *                     "start at the beginning of buf"
 *   token           - out: begin/len of the token
 *   exceptions      - in/out list of struct process_exception; may be NULL
 *   is_utf          - must be TRUE (asserted)
 *   rl              - out, may be NULL: number of word characters consumed,
 *                     0 when only delimiters were skipped
 *   check_signature - when TRUE, a '_' or '-' met while skipping delimiters
 *                     (and not on a call that started at offset 0) that is
 *                     followed only by ' ', '-' or '_' up to a CR/LF is
 *                     treated as a signature separator and ends
 *                     tokenization
 *
 * When the exception at the head of the list starts at the current position
 * the token is the static placeholder "!!EX!!" (it does not point into buf),
 * *rl is its length and the cursor is moved past the exception.
 *
 * Returns TRUE when the caller should call again, FALSE when the cursor is
 * at or past the end of buf, when the last character is truncated, or when a
 * signature separator has been found.
 */
static gboolean
rspamd_tokenizer_get_word (rspamd_ftok_t * buf,
		gchar const **cur, rspamd_ftok_t * token,
		GList **exceptions, gboolean is_utf, gsize *rl,
		gboolean check_signature)
{
	gsize remain, pos, siglen = 0;
	const gchar *p, *next_p, *sig = NULL;
	gunichar uc;
	guint processed = 0;
	struct process_exception *ex = NULL;
	enum {
		skip_delimiters = 0,
		feed_token,
		skip_exception,
		process_signature
	} state = skip_delimiters;

	if (buf == NULL) {
		return FALSE;
	}

	/* Only the exception at the head of the list is considered */
	if (exceptions != NULL && *exceptions != NULL) {
		ex = (*exceptions)->data;
	}

	g_assert (is_utf);
	g_assert (cur != NULL);

	if (*cur == NULL) {
		*cur = buf->begin;
	}

	token->len = 0;

	pos = *cur - buf->begin;
	if (pos >= buf->len) {
		return FALSE;
	}

	remain = buf->len - pos;
	p = *cur;
	token->begin = p;

	while (remain > 0) {
		/*
		 * The length of a character is derived from its lead byte alone, so
		 * check that the whole sequence lies inside the buffer *before*
		 * decoding it: decoding first would read continuation bytes past the
		 * end of buf when the text ends with a truncated sequence.
		 */
		next_p = g_utf8_next_char (p);

		if (next_p - p > (gint)remain) {
			return FALSE;
		}

		uc = g_utf8_get_char (p);

		/*
		 * `continue` re-enters the switch in the new state without moving
		 * past the current character; `break` advances to the next one.
		 */
		switch (state) {
		case skip_delimiters:
			if (ex != NULL && p - buf->begin == (gint)ex->pos) {
				/* Emit a placeholder token instead of the excepted text */
				token->begin = "!!EX!!";
				token->len = sizeof ("!!EX!!") - 1;
				processed = token->len;
				state = skip_exception;
				continue;
			}
			else if (g_unichar_isgraph (uc)) {
				if (!g_unichar_ispunct (uc)) {
					/* First character of a word */
					state = feed_token;
					token->begin = p;
					continue;
				}
				else if (check_signature && pos != 0 && (*p == '_' || *p == '-')) {
					/* Possible signature separator, remember where it starts */
					sig = p;
					siglen = remain;
					state = process_signature;
					continue;
				}
			}
			break;
		case feed_token:
			if (ex != NULL && p - buf->begin == (gint)ex->pos) {
				/* The word ends where the exception starts */
				goto set_token;
			}
			else if (!g_unichar_isgraph (uc) || g_unichar_ispunct (uc)) {
				goto set_token;
			}
			processed ++;
			break;
		case skip_exception:
			/* Jump over the excepted region and move to the next exception */
			*cur = p + ex->len;
			*exceptions = g_list_next (*exceptions);
			goto set_token;
			break;
		case process_signature:
			if (*p == '\r' || *p == '\n') {
				/*
				 * NOTE(review): this relies on the logger interpreting "%*s"
				 * as a (length, string) pair rather than as a field width -
				 * confirm before changing the format.
				 */
				msg_debug ("signature found: %*s", (gint)siglen, sig);
				return FALSE;
			}
			else if (*p != ' ' && *p != '-' && *p != '_') {
				/* Not a separator line after all, resume normal scanning */
				state = skip_delimiters;
				continue;
			}
			break;
		}

		remain -= next_p - p;
		p = next_p;
	}

set_token:
	if (rl) {
		*rl = processed;
	}

	/*
	 * token->len is already set only for the exception placeholder, whose
	 * cursor was moved in skip_exception; in every other case the token is
	 * the span scanned so far and the cursor stops at its end.
	 */
	if (token->len == 0) {
		token->len = p - token->begin;
		g_assert (token->len > 0);
		*cur = p;
	}

	return TRUE;
}

/*
 * Split `text` into words.
 *
 *   text       - text to tokenize; NULL makes the function return NULL
 *   len        - length of text in bytes
 *   is_utf     - whether text is UTF-8
 *   cfg        - optional configuration; supplies min_word_len, max_word_len
 *                and words_decay (all treated as 0 when cfg is NULL)
 *   exceptions - list of struct process_exception handed to the tokenizer
 *   compat     - force the legacy byte-oriented tokenizer; it is also used
 *                whenever is_utf is FALSE
 *   hash       - out, may be NULL: receives the XXH64 digest (seed 0) of the
 *                kept words when decay never started, otherwise the state of
 *                the decay generator after its last step
 *
 * Returns a new GArray of rspamd_ftok_t. The elements are not copies of the
 * words: they reference `text` (or a static placeholder for exceptions).
 *
 * Words whose length is 0, below min_len or above max_len (limits of 0 are
 * disabled) are dropped. Once more than words_decay words have been kept and
 * there is still text left, further words are kept only with a probability
 * derived from the amount of remaining text; the decision sequence is
 * generated by an LCG seeded with the digest of the words kept so far.
 */
GArray *
rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
		struct rspamd_config *cfg, GList *exceptions, gboolean compat,
		guint64 *hash)
{
	rspamd_ftok_t token, buf;
	const gchar *pos = NULL;
	/*
	 * Must start out defined: the compat tokenizer can return TRUE without
	 * writing the length (when it stops at an exception), and the value is
	 * inspected right after the call.
	 */
	gsize l = 0;
	GArray *res;
	GList *cur = exceptions;
	token_get_function func;
	guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
	guint64 hv = 0;
	XXH64_state_t *st;
	gboolean decay = FALSE;
	/* Only meaningful once decay is TRUE; initialised to keep it defined */
	guint64 prob = 0;

	if (text == NULL) {
		return NULL;
	}

	buf.begin = text;
	buf.len = len;
	token.begin = NULL;
	token.len = 0;

	if (compat || !is_utf) {
		func = rspamd_tokenizer_get_word_compat;
	}
	else {
		func = rspamd_tokenizer_get_word;
	}

	if (cfg != NULL) {
		min_len = cfg->min_word_len;
		max_len = cfg->max_word_len;
		word_decay = cfg->words_decay;

		/* Keep the default preallocation when decay is disabled */
		if (word_decay > 0) {
			initial_size = word_decay * 2;
		}
	}

	res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), initial_size);
	st = XXH64_createState ();
	XXH64_reset (st, 0);

	while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) {
		if (l == 0 || (min_len > 0 && l < min_len) ||
					(max_len > 0 && l > max_len)) {
			/* Rejected by the length limits */
			token.begin = pos;
			continue;
		}

		if (!decay) {
			XXH64_update (st, token.begin, token.len);

			/* Check for decay */
			if (word_decay > 0 && res->len > word_decay && pos - text < (gssize)len) {
				/* Start decay */
				gdouble decay_prob;

				decay = TRUE;
				hv = XXH64_digest (st);

				/* We assume that word is 6 symbols length in average */
				decay_prob = (gdouble)word_decay / ((len - (pos - text)) / 6.0);

				if (decay_prob >= 1.0) {
					prob = G_MAXUINT64;
				}
				else {
					prob = decay_prob * G_MAXUINT64;
				}
			}
		}
		else {
			/* Decaying probability */
			/* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
			hv = 2862933555777941757ULL * hv + 3037000493ULL;

			if (hv > prob) {
				/* This word is dropped by the decay */
				token.begin = pos;
				continue;
			}
		}

		g_array_append_val (res, token);
		token.begin = pos;
	}

	if (!decay) {
		hv = XXH64_digest (st);
	}

	if (hash) {
		*hash = hv;
	}

	XXH64_freeState (st);

	return res;
}

/*
 * vi:ts=4
 */