Uncompatible changes: - Statistics is uncompatible in utf8 mode Major changes: - Improved utf8 mode - Convert all characters to lowercase in statistics - Skip URL's in statistics - Improve speed of bayes classifier by using integer arithmetics - Fixed statfiles synchronization that was broken for a long time - Synchronization is now configurable Minor changes: - Bugfixes - Removed some of legacy code - Types polishing

13 år sedan · a3fa4d6723
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,8 +6,8 @@
 PROJECT(rspamd C)

 SET(RSPAMD_VERSION_MAJOR 0)
 SET(RSPAMD_VERSION_MINOR 3)
 SET(RSPAMD_VERSION_PATCH 14)
 SET(RSPAMD_VERSION_MINOR 4)
 SET(RSPAMD_VERSION_PATCH 0)


 SET(RSPAMD_VERSION         "${RSPAMD_VERSION_MAJOR}.${RSPAMD_VERSION_MINOR}.${RSPAMD_VERSION_PATCH}")
--- a/src/binlog.c
+++ b/src/binlog.c
@@ -120,7 +120,7 @@ binlog_check_file (struct rspamd_binlog *log)
 		return FALSE;
 	}

 	log->cur_seq = log->cur_idx->last_index;
 	log->cur_seq = log->metaindex->last_index * BINLOG_IDX_LEN + log->cur_idx->last_index;
 	log->cur_time = log->cur_idx->indexes[log->cur_idx->last_index].time;

 	return TRUE;
@@ -425,6 +425,13 @@ binlog_sync (struct rspamd_binlog *log, guint64 from_rev, guint64 *from_time, GB
 		*rep = NULL;
 		return FALSE;
 	}
 	else if (from_rev > log->cur_seq) {
 		/* Slave has more actual copy, write this to log and abort sync */
 		msg_warn ("slave has more recent revision of statfile %s: %uL and our is: %uL", log->filename, from_rev, log->cur_seq);
 		*rep = NULL;
 		*from_time = 0;
 		return FALSE;
 	}

 	metaindex_num = from_rev / BINLOG_IDX_LEN;
 	/* First of all try to find this revision */
--- a/src/cfg_file.h
+++ b/src/cfg_file.h
@@ -270,6 +270,8 @@ struct config_file {
 	gboolean log_extended;							/**< log extended information							*/

 	gsize max_statfile_size;						/**< maximum size for statfile							*/
 	guint32 statfile_sync_interval;					/**< synchronization interval							*/
 	guint32 statfile_sync_timeout;					/**< synchronization timeout							*/

 	struct memcached_server memcached_servers[MAX_MEMCACHED_SERVERS];	/**< memcached servers				*/
 	gsize memcached_servers_num;					/**< number of memcached servers						*/
--- a/src/cfg_utils.c
+++ b/src/cfg_utils.c
@@ -172,6 +172,9 @@ init_defaults (struct config_file *cfg)
 	cfg->dns_throttling_errors = 20;
 	cfg->dns_throttling_time = 10000;

 	cfg->statfile_sync_interval = 60000;
 	cfg->statfile_sync_timeout = 20000;

 	cfg->max_statfile_size = DEFAULT_STATFILE_SIZE;
 	cfg->modules_opts = g_hash_table_new (g_str_hash, g_str_equal);
 	cfg->variables = g_hash_table_new (g_str_hash, g_str_equal);
--- a/src/cfg_xml.c
+++ b/src/cfg_xml.c
@@ -281,6 +281,18 @@ static struct xml_parser_rule grammar[] = {
 				G_STRUCT_OFFSET (struct config_file, filters_str),
 				NULL
 			},
 			{
 				"sync_interval",
 				xml_handle_seconds,
 				G_STRUCT_OFFSET (struct config_file, statfile_sync_interval),
 				NULL
 			},
 			{
 				"sync_timeout",
 				xml_handle_seconds,
 				G_STRUCT_OFFSET (struct config_file, statfile_sync_timeout),
 				NULL
 			},
 			NULL_ATTR
 		},
 		NULL_DEF_ATTR
--- a/src/classifiers/bayes.c
+++ b/src/classifiers/bayes.c
@@ -43,11 +43,11 @@ bayes_error_quark (void)
 }

 struct bayes_statfile_data {
 	double                          hits;
 	double                          total_hits;
 	guint64                         hits;
 	guint64                         total_hits;
 	double                          local_probability;
 	double                          post_probability;
 	double                          value;
 	guint                           value;
 	struct statfile                *st;
 	stat_file_t                    *file;
 };
@@ -67,25 +67,22 @@ bayes_learn_callback (gpointer key, gpointer value, gpointer data)
 {
 	token_node_t                   *node = key;
 	struct bayes_callback_data     *cd = data;
 	double                          v, c;
 	gint                            v, c;

 	c = (cd->in_class) ? 1 : -1;

 	/* Consider that not found blocks have value 1 */
 	v = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, cd->now);
 	if (fabs (v) < ALPHA && c > 0) {
 	if (v == 0 && c > 0) {
 		statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, c);
 	}
 	else {
 		if (G_LIKELY (c > 0 && c < G_MAXDOUBLE)) {
 			v += c;
 		if (G_LIKELY (c > 0)) {
 			v ++;
 		}
 		else if (c < 0){
 			if (v > -c) {
 				v -= c;
 			}
 			else {
 				v = 0;
 			if (v != 0) {
 				v --;
 			}
 		}
 		statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, v);
@@ -103,14 +100,15 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)

 	token_node_t                   *node = key;
 	struct bayes_callback_data     *cd = data;
 	double                          local_hits = 0, renorm = 0;
 	int                             i;
 	double                          renorm = 0;
 	gint                            i;
 	guint64                         local_hits = 0;
 	struct bayes_statfile_data     *cur;

 	for (i = 0; i < cd->statfiles_num; i ++) {
 		cur = &cd->statfiles[i];
 		cur->value = statfile_pool_get_block (cd->pool, cur->file, node->h1, node->h2, cd->now);
 		if (cur->value > ALPHA) {
 		if (cur->value > 0) {
 			cur->total_hits += cur->value;
 			cur->hits = cur->value;
 			local_hits += cur->value;
@@ -121,7 +119,8 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)
 	}
 	for (i = 0; i < cd->statfiles_num; i ++) {
 		cur = &cd->statfiles[i];
 		cur->local_probability = 0.5 + (cur->value - (local_hits - cur->value)) / (LOCAL_PROB_DENOM * (local_hits + 1.0));
 		cur->local_probability = 0.5 + ((double)cur->value - ((double)local_hits - cur->value)) /
 				(LOCAL_PROB_DENOM * (1.0 + local_hits));
 		renorm += cur->post_probability * cur->local_probability;
 	}

--- a/src/controller.c
+++ b/src/controller.c
@@ -212,7 +212,7 @@ write_whole_statfile (struct controller_session *session, gchar *symbol, struct
 	struct statfile                *st;
 	gchar                           out_buf[BUFSIZ];
 	gint                            i;
 	guint64                         rev, ti, len, pos;
 	guint64                         rev, ti, len, pos, blocks;
 	gchar                           *out;
 	struct rspamd_binlog_element    log_elt;
 	struct stat_file_block         *stat_elt;
@@ -222,7 +222,7 @@ write_whole_statfile (struct controller_session *session, gchar *symbol, struct
 	if (statfile == NULL) {
 		return FALSE;
 	}
 	

 	/* Begin to copy all blocks into array */
 	statfile_get_revision (statfile, &rev, (time_t *)&ti);
 	if (ti == 0) {
@@ -230,10 +230,13 @@ write_whole_statfile (struct controller_session *session, gchar *symbol, struct
 		ti = time (NULL);
 		statfile_set_revision (statfile, rev, ti);
 	}
 	len = statfile->cur_section.length * sizeof (struct rspamd_binlog_element);
 	msg_info ("send a whole statfile %s with version %uL to slave", symbol, rev);

 	blocks = statfile_get_total_blocks (statfile);
 	len = blocks * sizeof (struct rspamd_binlog_element);
 	out = memory_pool_alloc (session->session_pool, len);

 	for (i = 0, pos = 0; i < statfile->cur_section.length; i ++) {
 	for (i = 0, pos = 0; i < blocks; i ++) {
 		stat_elt = (struct stat_file_block *)((u_char *)statfile->map + statfile->seek_pos + i * sizeof (struct stat_file_block));
 		if (fabs (stat_elt->value) > 0.001) {
 			/* Write only those values which value is not 0 */
@@ -324,6 +327,7 @@ process_sync_command (struct controller_session *session, gchar **args)
 	}
 	
 	while (binlog_sync (binlog, rev, &time, &data)) {
 		rev ++;
 		r = rspamd_snprintf (out_buf, sizeof (out_buf), "%uL %uL %z" CRLF, rev, time, data->len);
 		if (! rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE)) {
 			if (data != NULL) {
@@ -339,7 +343,6 @@ process_sync_command (struct controller_session *session, gchar **args)
 				return FALSE;
 			}
 		}
 		rev ++;
 	}

 	if (time == 0) {
@@ -666,12 +669,6 @@ process_command (struct controller_command *cmd, gchar **cmd_args, struct contro
 			}
 			return TRUE;
 		}
 		else {
 			if (! rspamd_dispatcher_write (session->dispatcher, CRLF, sizeof (CRLF) - 1, FALSE, TRUE)) {
 				return FALSE;
 			}
 			return TRUE;
 		}
 		break;
 	case COMMAND_HELP:
 		r = rspamd_snprintf (out_buf, sizeof (out_buf),
@@ -851,7 +848,7 @@ controller_read_socket (f_str_t * in, void *arg)
 			c.begin = part->content->data;
 			c.len = part->content->len;
 			if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer,
 					session->session_pool, &c, &tokens, FALSE, part->is_utf)) {
 					session->session_pool, &c, &tokens, FALSE, part->is_utf, part->urls_offset)) {
 				i = rspamd_snprintf (out_buf, sizeof (out_buf), "weights failed, tokenizer error" CRLF END);
 				free_task (task, FALSE);
 				if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) {
--- a/src/filter.c
+++ b/src/filter.c
@@ -612,7 +612,7 @@ classifiers_callback (gpointer value, void *arg)
 				c.len = strlen (cur->data);
 				if (c.len > 0) {
 					c.begin = cur->data;
 					if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE)) {
 					if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE, NULL)) {
 						msg_info ("cannot tokenize input");
 						return;
 					}
@@ -627,7 +627,7 @@ classifiers_callback (gpointer value, void *arg)
 				c.begin = text_part->content->data;
 				c.len = text_part->content->len;
 				/* Tree would be freed at task pool freeing */
 				if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf)) {
 				if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf, text_part->urls_offset)) {
 					msg_info ("cannot tokenize input");
 					return;
 				}
@@ -805,7 +805,7 @@ check_metric_action (double score, double required_score, struct metric *metric)
 gboolean
 learn_task (const gchar *statfile, struct worker_task *task, GError **err)
 {
 	GList                          *cur;
 	GList                          *cur, *ex;
 	struct classifier_config       *cl;
 	struct classifier_ctx          *cls_ctx;
 	gchar                          *s;
@@ -841,6 +841,7 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
 		if (s != NULL) {
 			c.len = strlen (cur->data);
 			c.begin = cur->data;
 			ex = NULL;
 		}
 		else {
 			part = cur->data;
@@ -852,11 +853,12 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
 			c.begin = part->content->data;
 			c.len = part->content->len;
 			is_utf = part->is_utf;
 			ex = part->urls_offset;
 		}
 		/* Get tokens */
 		if (!cl->tokenizer->tokenize_func (
 				cl->tokenizer, task->task_pool,
 				&c, &tokens, FALSE, is_utf)) {
 				&c, &tokens, FALSE, is_utf, ex)) {
 			g_set_error (err, filter_error_quark(), 2, "Cannot tokenize message");
 			return FALSE;
 		}
--- a/src/fstring.c
+++ b/src/fstring.c
@@ -297,6 +297,34 @@ fstrgrow (memory_pool_t * pool, f_str_t * orig, size_t newlen)
 	return res;
 }

 static guint32
 fstrhash_c (gchar c, guint32 hval)
 {
 	guint32                         tmp;
 	/*
 	 * xor in the current byte against each byte of hval
 	 * (which alone gaurantees that every bit of input will have
 	 * an effect on the output)
 	 */
 	tmp = c & 0xFF;
 	tmp = tmp | (tmp << 8) | (tmp << 16) | (tmp << 24);
 	hval ^= tmp;

 	/* add some bits out of the middle as low order bits */
 	hval = hval + ((hval >> 12) & 0x0000ffff);

 	/* swap most and min significative bytes */
 	tmp = (hval << 24) | ((hval >> 24) & 0xff);
 	/* zero most and min significative bytes of hval */
 	hval &= 0x00ffff00;
 	hval |= tmp;
 	/*
 	 * rotate hval 3 bits to the left (thereby making the
 	 * 3rd msb of the above mess the hsb of the output hash)
 	 */
 	return (hval << 3) + (hval >> 29);
 }

 /*
 * Return hash value for a string
 */
@@ -305,7 +333,6 @@ fstrhash (f_str_t * str)
 {
 	size_t                          i;
 	guint32                         hval;
 	guint32                         tmp;
 	gchar                           *c = str->begin;

 	if (str == NULL) {
@@ -314,32 +341,54 @@ fstrhash (f_str_t * str)
 	hval = str->len;

 	for (i = 0; i < str->len; i++, c++) {
 		/* 
 		 * xor in the current byte against each byte of hval
 		 * (which alone gaurantees that every bit of input will have
 		 * an effect on the output)
 		 */
 		tmp = *c & 0xFF;
 		tmp = tmp | (tmp << 8) | (tmp << 16) | (tmp << 24);
 		hval ^= tmp;

 		/* add some bits out of the middle as low order bits */
 		hval = hval + ((hval >> 12) & 0x0000ffff);

 		/* swap most and min significative bytes */
 		tmp = (hval << 24) | ((hval >> 24) & 0xff);
 		/* zero most and min significative bytes of hval */
 		hval &= 0x00ffff00;
 		hval |= tmp;
 		/*
 		 * rotate hval 3 bits to the left (thereby making the
 		 * 3rd msb of the above mess the hsb of the output hash)
 		 */
 		hval = (hval << 3) + (hval >> 29);
 		hval = fstrhash_c (*c, hval);
 	}
 	return hval;
 }

 /*
 * Return hash value for a string
 */
 guint32
 fstrhash_lowercase (f_str_t * str, gboolean is_utf)
 {
 	gsize                           i;
 	guint32                         j, hval;
 	const gchar                    *p = str->begin, *end = NULL;
 	gchar                           t;
 	gunichar                        uc;

 	if (str == NULL) {
 		return 0;
 	}
 	hval = str->len;

 	if (is_utf) {
 		while (end < str->begin + str->len) {
 			g_utf8_validate (p, str->len, &end);
 			while (p < end) {
 				uc = g_unichar_tolower (g_utf8_get_char (p));
 				for (j = 0; j < sizeof (gunichar); j ++) {
 					t = (uc >> (j * 8)) & 0xff;
 					if (t != 0) {
 						hval = fstrhash_c (t, hval);
 					}
 				}
 				p = g_utf8_next_char (p);
 			}
 			p = end + 1;
 		}

 	}
 	else {
 		for (i = 0; i < str->len; i++, p++) {
 			hval = fstrhash_c (g_ascii_tolower (*p), hval);
 		}
 	}

 	return hval;
 }

 void
 fstrstrip (f_str_t * str)
 {
--- a/src/fstring.h
+++ b/src/fstring.h
@@ -93,6 +93,10 @@ f_str_t* fstrgrow (memory_pool_t *pool, f_str_t *orig, size_t newlen);
 */
 guint32 fstrhash (f_str_t *str);

 /*
 * Return fast hash value for fixed string converted to lowercase
 */
 guint32 fstrhash_lowercase (f_str_t *str, gboolean is_utf);
 /*
 * Make copy of string to 0-terminated string
 */
--- a/src/fuzzy.c
+++ b/src/fuzzy.c
@@ -320,11 +320,11 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
 	gchar                          *c;
 	gsize                           real_len = 0, len = part->content->len;
 	GList                          *cur_offset;
 	struct uri                     *cur_url = NULL;
 	struct process_exception       *cur_ex = NULL;

 	cur_offset = part->urls_offset;
 	if (cur_offset != NULL) {
 		cur_url = cur_offset->data;
 		cur_ex = cur_offset->data;
 	}

 	c = part->content->data;
@@ -332,12 +332,12 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
 	new2 = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t));
 	bzero (&rs, sizeof (rs));
 	for (i = 0; i < len;) {
 		if (cur_url != NULL && cur_url->pos == i) {
 			i += cur_url->len + 1;
 			c += cur_url->len + 1;
 		if (cur_ex != NULL && cur_ex->pos == i) {
 			i += cur_ex->len + 1;
 			c += cur_ex->len + 1;
 			cur_offset = g_list_next (cur_offset);
 			if (cur_offset != NULL) {
 				cur_url = cur_offset->data;
 				cur_ex = cur_offset->data;
 			}
 		}
 		else {
@@ -354,18 +354,18 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)

 	cur_offset = part->urls_offset;
 	if (cur_offset != NULL) {
 		cur_url = cur_offset->data;
 		cur_ex = cur_offset->data;
 	}

 	c = part->content->data;

 	for (i = 0; i < len;) {
 		if (cur_url != NULL && cur_url->pos == i) {
 			i += cur_url->len + 1;
 			c += cur_url->len + 1;
 		if (cur_ex != NULL && cur_ex->pos == i) {
 			i += cur_ex->len + 1;
 			c += cur_ex->len + 1;
 			cur_offset = g_list_next (cur_offset);
 			if (cur_offset != NULL) {
 				cur_url = cur_offset->data;
 				cur_ex = cur_offset->data;
 			}
 		}
 		else {
--- a/src/main.h
+++ b/src/main.h
@@ -113,6 +113,13 @@ struct save_point {
 	guint saved;											/**< how much time we have delayed processing		*/
 };

 /**
 * Structure to point exception in text from processing
 */
 struct process_exception {
 	gsize pos;
 	gsize len;
 };

 /**
 * Union that would be used for storing sockaddrs
--- a/src/mem_pool.c
+++ b/src/mem_pool.c
@@ -91,6 +91,7 @@ pool_chain_new (gsize size)
 	chain->pos = align_ptr (chain->begin, MEM_ALIGNMENT);
 	chain->next = NULL;
 	STAT_LOCK ();
 	mem_pool_stat->bytes_allocated += size;
 	mem_pool_stat->chunks_allocated++;
 	STAT_UNLOCK ();

@@ -135,6 +136,7 @@ pool_chain_new_shared (gsize size)
 	chain->next = NULL;
 	STAT_LOCK ();
 	mem_pool_stat->shared_chunks_allocated++;
 	mem_pool_stat->bytes_allocated += size;
 	STAT_UNLOCK ();

 	return chain;
@@ -225,16 +227,11 @@ memory_pool_alloc (memory_pool_t * pool, gsize size)
 			cur->next = new;
 			pool->cur_pool = new;
 			new->pos += size;
 			STAT_LOCK ();
 			mem_pool_stat->bytes_allocated += size;
 			STAT_UNLOCK ();

 			return new->begin;
 		}
 		tmp = align_ptr (cur->pos, MEM_ALIGNMENT);
 		cur->pos = tmp + size;
 		STAT_LOCK ();
 		mem_pool_stat->bytes_allocated += size;
 		STAT_UNLOCK ();
 		return tmp;
 	}
 	return NULL;
@@ -349,9 +346,6 @@ memory_pool_alloc_shared (memory_pool_t * pool, gsize size)
 		}
 		tmp = align_ptr (cur->pos, MEM_ALIGNMENT);
 		cur->pos = tmp + size;
 		STAT_LOCK ();
 		mem_pool_stat->bytes_allocated += size;
 		STAT_UNLOCK ();
 		return tmp;
 	}
 	return NULL;
@@ -506,20 +500,22 @@ memory_pool_delete (memory_pool_t * pool)
 	while (cur) {
 		tmp = cur;
 		cur = cur->next;
 		g_slice_free1 (tmp->len, tmp->begin);
 		g_slice_free (struct _pool_chain, tmp);
 		STAT_LOCK ();
 		mem_pool_stat->chunks_freed++;
 		mem_pool_stat->bytes_allocated -= tmp->len;
 		STAT_UNLOCK ();
 		g_slice_free1 (tmp->len, tmp->begin);
 		g_slice_free (struct _pool_chain, tmp);
 	}
 	/* Unmap shared memory */
 	while (cur_shared) {
 		tmp_shared = cur_shared;
 		cur_shared = cur_shared->next;
 		munmap ((void *)tmp_shared, tmp_shared->len + sizeof (struct _pool_chain_shared));
 		STAT_LOCK ();
 		mem_pool_stat->chunks_freed++;
 		mem_pool_stat->bytes_allocated -= tmp->len;
 		STAT_UNLOCK ();
 		munmap ((void *)tmp_shared, tmp_shared->len + sizeof (struct _pool_chain_shared));
 	}
 	if (pool->variables) {
 		g_hash_table_destroy (pool->variables);
--- a/src/message.c
+++ b/src/message.c
@@ -686,25 +686,6 @@ free_byte_array_callback (void *pointer)
 	g_byte_array_free (arr, TRUE);
 }

 static void
 detect_real_charset (struct worker_task *task, GByteArray * part_content, struct mime_text_part *text_part)
 {
 	/* First of all try to detect UTF symbols */
 	text_part->is_utf = FALSE;
 	/* At first decision try to validate a single character */
 	if (g_utf8_get_char_validated (part_content->data, part_content->len) != -1) {
 		/* Now validate the whole part */
 		if (g_utf8_validate (part_content->data, part_content->len, NULL)) {
 			text_part->is_utf = TRUE;
 			text_part->real_charset = UTF8_CHARSET;
 			return;
 		}
 	}
 	
 	/* Now try to detect specific symbols from some charsets */
 	
 }

 static GByteArray              *
 convert_text_to_utf (struct worker_task *task, GByteArray * part_content, GMimeContentType * type, struct mime_text_part *text_part)
 {
@@ -726,6 +707,7 @@ convert_text_to_utf (struct worker_task *task, GByteArray * part_content, GMimeC

 	if (g_ascii_strcasecmp (charset, "utf-8") == 0 || g_ascii_strcasecmp (charset, "utf8") == 0) {
 		text_part->is_raw = FALSE;
 		text_part->is_utf = TRUE;
 		return part_content;
 	}

@@ -741,6 +723,7 @@ convert_text_to_utf (struct worker_task *task, GByteArray * part_content, GMimeC
 	result_array->len = write_bytes;
 	memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_free, res_str);
 	text_part->is_raw = FALSE;
 	text_part->is_utf = TRUE;

 	return result_array;
 }
--- a/src/statfile.c
+++ b/src/statfile.c
@@ -580,7 +580,6 @@ statfile_pool_get_block (statfile_pool_t * pool, stat_file_t * file, guint32 h1,
 	return 0;
 }

 #define RANDOM_EXPIRE G_MAXINT / CHAIN_LENGTH
 static void
 statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guint32 h1, guint32 h2, time_t t, double value, gboolean from_now)
 {
@@ -590,7 +589,6 @@ statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guin
 	u_char                         *c;
    double                          min = G_MAXDOUBLE;


 	if (from_now) {
 		file->access_time = t;
 	}
@@ -626,14 +624,10 @@ statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guin
 			return;
 		}
 		
 		/* Expire block if we have some random value that is lower than RANDOM_EXPIRE value */
 		if (g_random_int () < RANDOM_EXPIRE) {
 			to_expire = block;
 			break;
 		}
 		/* Expire block with minimum value otherwise */
 		if (block->value < min) {
 			to_expire = block;
 			min = block->value;
 		}
 		c += sizeof (struct stat_file_block);
 		block = (struct stat_file_block *)c;
--- a/src/statfile_sync.c
+++ b/src/statfile_sync.c
@@ -31,11 +31,6 @@
 #include "buffer.h"
 #include "statfile_sync.h"

 /* XXX: hardcoding this value is not very smart */
 #define MAX_SYNC_TIME 60
 #define IO_TIMEOUT 20


 enum rspamd_sync_state {
 	SYNC_STATE_GREETING,
 	SYNC_STATE_READ_LINE,
@@ -54,7 +49,9 @@ struct rspamd_sync_ctx {

 	struct timeval interval;
 	struct timeval io_tv;
 	gint                            sock;
 	gint sock;
 	guint32 timeout;
 	guint32 sync_interval;
 	enum rspamd_sync_state state;
 	gboolean is_busy;

@@ -141,9 +138,9 @@ parse_revision_line (struct rspamd_sync_ctx *ctx, f_str_t *in)
 			}
 		}
 	}
 	

 	/* Current value must be len value and its value must not be 0 */
 	return ((val == &ctx->new_len) && *val != 0);
 	return ((val == &ctx->new_len));
 }

 static                          gboolean
@@ -193,8 +190,10 @@ sync_read (f_str_t * in, void *arg)
 				return FALSE;
 			}
 			else if (ctx->state != SYNC_STATE_QUIT) {
 				ctx->state = SYNC_STATE_READ_REV;
 				rspamd_set_dispatcher_policy (ctx->dispatcher, BUFFER_CHARACTER, ctx->new_len);
 				if (ctx->new_len > 0) {
 					ctx->state = SYNC_STATE_READ_REV;
 					rspamd_set_dispatcher_policy (ctx->dispatcher, BUFFER_CHARACTER, ctx->new_len);
 				}
 			}
 			else {
 				/* Quit this session */
@@ -247,11 +246,13 @@ static void
 sync_timer_callback (gint fd, short what, void *ud)
 {
 	struct rspamd_sync_ctx *ctx = ud;
 	guint32 jittered_interval;
 	
 	/* Plan new event */
 	evtimer_del (&ctx->tm_ev);
 	ctx->interval.tv_sec = g_random_int_range (MAX_SYNC_TIME, MAX_SYNC_TIME * 2);
 	ctx->interval.tv_usec = 0;
 	/* Add some jittering for synchronization */
 	jittered_interval = g_random_int_range (ctx->sync_interval, ctx->sync_interval * 2);
 	msec_to_tv (jittered_interval, &ctx->interval);
 	evtimer_add (&ctx->tm_ev, &ctx->interval);
 	log_next_sync (ctx->st->symbol, ctx->interval.tv_sec);
 	
@@ -266,8 +267,7 @@ sync_timer_callback (gint fd, short what, void *ud)
 		return;
 	}
 	/* Now create and activate dispatcher */
 	ctx->io_tv.tv_sec = IO_TIMEOUT;
 	ctx->io_tv.tv_usec = 0;
 	msec_to_tv (ctx->timeout, &ctx->io_tv);
 	ctx->dispatcher = rspamd_create_dispatcher (ctx->sock, BUFFER_LINE, sync_read, NULL, sync_err, &ctx->io_tv, ctx);
 	
 	ctx->state = SYNC_STATE_GREETING;
@@ -278,17 +278,20 @@ sync_timer_callback (gint fd, short what, void *ud)
 }

 static gboolean
 add_statfile_watch (statfile_pool_t *pool, struct statfile *st)
 add_statfile_watch (statfile_pool_t *pool, struct statfile *st, struct config_file *cfg)
 {
 	struct rspamd_sync_ctx *ctx;
 	guint32 jittered_interval;
 	
 	if (st->binlog->master_addr.s_addr != INADDR_NONE &&
 			st->binlog->master_addr.s_addr != INADDR_ANY) {
 		ctx = memory_pool_alloc (pool->pool, sizeof (struct rspamd_sync_ctx));
 		ctx->st = st;
 		ctx->timeout = cfg->statfile_sync_timeout;
 		ctx->sync_interval = cfg->statfile_sync_interval;
 		/* Add some jittering for synchronization */
 		ctx->interval.tv_sec = g_random_int_range (MAX_SYNC_TIME, MAX_SYNC_TIME * 2);
 		ctx->interval.tv_usec = 0;
 		jittered_interval = g_random_int_range (ctx->sync_interval, ctx->sync_interval * 2);
 		msec_to_tv (jittered_interval, &ctx->interval);
 		/* Open statfile and attach it to pool */
 		if ((ctx->real_statfile = statfile_pool_is_open (pool, st->path)) == NULL) {
 			if ((ctx->real_statfile = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
@@ -331,7 +334,7 @@ start_statfile_sync (statfile_pool_t *pool, struct config_file *cfg)
 		while (l) {
 			st = l->data;
 			if (st->binlog != NULL && st->binlog->affinity == AFFINITY_SLAVE) {
 				if (!add_statfile_watch (pool, st)) {
 				if (!add_statfile_watch (pool, st, cfg)) {
 					return FALSE;
 				}
 			}
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -36,55 +36,56 @@ extern const int                primes[];

 int
 osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * input, GTree ** tree,
 		gboolean save_token, gboolean is_utf)
 		gboolean save_token, gboolean is_utf, GList *exceptions)
 {
 	token_node_t                   *new = NULL;
 	f_str_t                         token = { NULL, 0, 0 }, *res;
 	uint32_t                        hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
 	int                             i;

 	/* First set all bytes of hashpipe to some common value */
 	for (i = 0; i < FEATURE_WINDOW_SIZE; i++) {
 		hashpipe[i] = 0xABCDEF;
 	}
 	f_str_t                         token = { NULL, 0, 0 };
 	guint32                         hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
 	gint                            i, k = 0, l;
 	gchar                          *res;

 	if (*tree == NULL) {
 		*tree = g_tree_new (token_node_compare_func);
 		memory_pool_add_destructor (pool, (pool_destruct_func) g_tree_destroy, *tree);
 	}

 	while ((res = tokenizer->get_next_word (input, &token)) != NULL) {
 	while ((res = tokenizer->get_next_word (input, &token, &exceptions)) != NULL) {
 		/* Skip small words */
 		if (is_utf) {
 			if (g_utf8_strlen (token.begin, token.len) < MIN_LEN) {
 				continue;
 			}
 			l = g_utf8_strlen (token.begin, token.len);
 		}
 		else {
 			if (token.len < MIN_LEN) {
 				continue;
 			}
 			l = token.len;
 		}
 		if (l < MIN_LEN) {
 			token.begin = res;
 			continue;
 		}

 		/* Shift hashpipe */
 		for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
 			hashpipe[i] = hashpipe[i - 1];
 		}
 		hashpipe[0] = fstrhash (&token);
 		hashpipe[0] = fstrhash_lowercase (&token, is_utf);

 		for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
 			h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
 			h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
 			new = memory_pool_alloc0 (pool, sizeof (token_node_t));
 			new->h1 = h1;
 			new->h2 = h2;
 			if (save_token) {
 				new->extra = (uintptr_t)memory_pool_fstrdup (pool, &token);
 			}
 		if (k > FEATURE_WINDOW_SIZE) {
 			for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
 				h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
 				h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
 				new = memory_pool_alloc0 (pool, sizeof (token_node_t));
 				new->h1 = h1;
 				new->h2 = h2;
 				if (save_token) {
 					new->extra = (uintptr_t)memory_pool_fstrdup (pool, &token);
 				}

 			if (g_tree_lookup (*tree, new) == NULL) {
 				g_tree_insert (*tree, new, new);
 				if (g_tree_lookup (*tree, new) == NULL) {
 					g_tree_insert (*tree, new, new);
 				}
 			}
 		}
 		k ++;
 		token.begin = res;
 	}

 	return TRUE;
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -52,7 +52,7 @@ const gchar t_delimiters[255] = {
 		1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 		0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
 		1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
 		1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
 		0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
 		1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -103,44 +103,76 @@ token_node_compare_func (gconstpointer a, gconstpointer b)
 }

 /* Get next word from specified f_str_t buf */
 f_str_t                        *
 get_next_word (f_str_t * buf, f_str_t * token)
 gchar *
 get_next_word (f_str_t * buf, f_str_t * token, GList **exceptions)
 {
 	size_t                          remain;
 	guchar                         *pos;
 	gsize                           remain, pos;
 	guchar                         *p;
 	struct process_exception	   *ex = NULL;

 	if (buf == NULL) {
 		return NULL;
 	}

 	if (*exceptions != NULL) {
 		ex = (*exceptions)->data;
 	}

 	if (token->begin == NULL) {
 		token->begin = buf->begin;
 		if (ex != NULL) {
 			if (ex->pos == 0) {
 				token->begin = buf->begin + ex->len;
 				token->len = ex->len;
 			}
 			else {
 				token->begin = buf->begin;
 				token->len = 0;
 			}
 		}
 		else {
 			token->begin = buf->begin;
 			token->len = 0;
 		}
 	}

 	token->begin = token->begin + token->len;
 	token->len = 0;

 	remain = buf->len - (token->begin - buf->begin);
 	if (remain <= 0) {
 		return NULL;
 	}
 	pos = token->begin;
 	pos = token->begin - buf->begin;
 	p = token->begin;
 	/* Skip non delimiters symbols */
 	while (remain > 0 && t_delimiters[*pos]) {
 		token->begin++;
 	do {
 		if (ex != NULL && ex->pos == pos) {
 			/* Go to the next exception */
 			*exceptions = g_list_next (*exceptions);
 			return p + ex->len + 1;
 		}
 		pos++;
 		p++;
 		remain--;
 	}
 	while (remain > 0 && !t_delimiters[*pos]) {
 	} while (remain > 0 && t_delimiters[*p]);

 	token->begin = p;

 	while (remain > 0 && !t_delimiters[*p]) {
 		if (ex != NULL && ex->pos == pos) {
 			*exceptions = g_list_next (*exceptions);
 			return p + ex->len + 1;
 		}
 		token->len++;
 		pos++;
 		remain--;
 		p ++;
 	}

 	if (token->len == 0) {
 	if (remain == 0) {
 		return NULL;
 	}

 	return token;
 	return p;
 }

 /* Struct to access gmime headers */
@@ -239,13 +271,13 @@ tokenize_subject (struct worker_task *task, GTree ** tree)
 		new = memory_pool_alloc (task->task_pool, sizeof (token_node_t));
 		subject.begin = task->subject;
 		subject.len = strlen (task->subject);
 		osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE);
 		osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE, NULL);
 	}
 	if ((sub = g_mime_message_get_subject (task->message)) != NULL) {
 		new = memory_pool_alloc (task->task_pool, sizeof (token_node_t));
 		subject.begin = (gchar *)sub;
 		subject.len = strlen (sub);
 		osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE);
 		osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE, NULL);
 	}
 }

--- a/src/tokenizers/tokenizers.h
+++ b/src/tokenizers/tokenizers.h
@@ -15,17 +15,18 @@
 #define FEATURE_WINDOW_SIZE 5

 typedef struct token_node_s {
 	uint32_t h1;
 	uint32_t h2;
 	guint32 h1;
 	guint32 h2;
 	float value;
 	uintptr_t extra;
 } token_node_t;

 /* Common tokenizer structure */
 struct tokenizer {
 	char *name;
 	int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf);
 	f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token);
 	gchar *name;
 	gint (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input,
 			GTree **cur, gboolean save_token, gboolean is_utf, GList *exceptions);
 	gchar* (*get_next_word)(f_str_t *buf, f_str_t *token, GList **exceptions);
 };

 /* Compare two token nodes */
@@ -33,9 +34,10 @@ int token_node_compare_func (gconstpointer a, gconstpointer b);
 /* Get tokenizer structure by name or return NULL if this name is not found */
 struct tokenizer* get_tokenizer (char *name);
 /* Get next word from specified f_str_t buf */
 f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
 gchar* get_next_word (f_str_t *buf, f_str_t *token, GList **exceptions);
 /* OSB tokenize function */
 int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf);
 int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input,
 		GTree **cur, gboolean save_token, gboolean is_utf, GList *exceptions);
 /* Common tokenizer for headers */
 int tokenize_headers (memory_pool_t *pool, struct worker_task *task, GTree **cur);
 /* Make tokens for a subject */
--- a/src/url.c
+++ b/src/url.c
@@ -1160,6 +1160,7 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text
 	gint                            rc;
 	gchar                          *url_str = NULL, *url_start, *url_end;
 	struct uri                     *new;
 	struct process_exception       *ex;
 	gchar                          *p, *end, *begin;


@@ -1183,13 +1184,14 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text
 			if (url_try_text (pool, p, end - p, &url_start, &url_end, &url_str)) {
 				if (url_str != NULL) {
 					new = memory_pool_alloc0 (pool, sizeof (struct uri));
 					ex = memory_pool_alloc0 (pool, sizeof (struct process_exception));
 					if (new != NULL) {
 						g_strstrip (url_str);
 						rc = parse_uri (new, url_str, pool);
 						if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) &&
 								new->hostlen > 0) {
 							new->pos = url_start - begin;
 							new->len = url_end - url_start;
 							ex->pos = url_start - begin;
 							ex->len = url_end - url_start;
 							if (new->protocol == PROTOCOL_MAILTO) {
 								if (!g_tree_lookup (task->emails, new)) {
 									g_tree_insert (task->emails, new, new);
@@ -1200,7 +1202,7 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text
 									g_tree_insert (task->urls, new, new);
 								}
 							}
 							part->urls_offset = g_list_prepend (part->urls_offset, new);
 							part->urls_offset = g_list_prepend (part->urls_offset, ex);
 						}
 						else if (rc != URI_ERRNO_OK) {
 							msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc));
@@ -1256,10 +1258,10 @@ url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gchar **start,
 				*url_str = NULL;
 			}
 			if (start != NULL) {
 				*start = (gchar *)pos;
 				*start = (gchar *)m.m_begin;
 			}
 			if (fin != NULL) {
 				*fin = (gchar *)pos + m.m_len;
 				*fin = (gchar *)m.m_begin + m.m_len;
 			}
 			return TRUE;
 		}
--- a/src/url.h
+++ b/src/url.h
@@ -32,9 +32,6 @@ struct uri {

 	struct uri *phished_url;

 	gsize pos;
 	gsize len;

 	/* @protocollen should only be usable if @protocol is either
 	 * PROTOCOL_USER or an uri string should be composed. */
 	guint protocollen;